Squashed 'thirdParty/cupla/' changes from 0594a68a0d..f60a0ac72c
f60a0ac72c Merge pull request ComputationalRadiationPhysics#155 from ComputationalRadiationPhysics/dev
4c73dde10d Merge pull request ComputationalRadiationPhysics#152 from psychocoderHPC/topic-changeLogVersionIncreaseTo0.2.0
fc428aa8e9 update release date for 0.2.0
1e32118c80 Merge pull request ComputationalRadiationPhysics#154 from sbastrakov/topic-addInt3Float3
84008fb6ae Add float3 and int3 with make-functions
4d5eace02a Merge pull request ComputationalRadiationPhysics#153 from sbastrakov/topic-finalizeBlackScholes2
bc35f6a822 Add Black-Scholes example
4b1ab5722f Merge pull request ComputationalRadiationPhysics#146 from psychocoderHPC/topic-changeLogVersionIncreaseTo0.2.0
52ddba6b26 changelog and version increase to 0.2.0
aabb8d5339 Merge pull request ComputationalRadiationPhysics#145 from psychocoderHPC/fix-minimalCMakeRequirements
7795e58c0c Merge pull request ComputationalRadiationPhysics#147 from psychocoderHPC/topic-updateAuthorSection
974afe8451 Merge pull request ComputationalRadiationPhysics#150 from sbastrakov/doc-clarifyThreadUnsafety
caf4fdda1e Extend the docs with thread unsafety
91d78da7eb Merge pull request ComputationalRadiationPhysics#149 from psychocoderHPC/topic-setTravisCmakeToMinimumRequiredVersion
a3bc920dfa use CMake 3.11.4 for travis tests
9ccab281e9 update author section
6bc13a220c Merge pull request ComputationalRadiationPhysics#148 from psychocoderHPC/topic-cuplaLogo
406fb5b93a update minimal CMake requirements to 3.11.4
9c32408da3 add cupla logo
cd6b9859d4 Merge pull request ComputationalRadiationPhysics#144 from psychocoderHPC/topic-hipPinnedMemory
c9fb7dd047 pin memory allocated with `cuplaMallocHoast`
92f8c82474 Merge pull request ComputationalRadiationPhysics#143 from psychocoderHPC/topic-updateAlpakaTo0.4.0
27f0bca235 Merge commit '0f175188a374a0194145728f2083256e1eb14b13' into topic-updateAlpakaTo0.4.0
0f175188a3 Squashed 'alpaka/' changes from d5e59590f..ab0b8a460
1e4aea9747 Merge pull request ComputationalRadiationPhysics#142 from sbastrakov/fix-typeCastWarnings
8f453a4bf5 Fix warnings concerning type casting
08f80ef24b Merge pull request ComputationalRadiationPhysics#141 from sbastrakov/topic-extendAtomicComments
b3d34b28ab Extend comments of atomic functions

git-subtree-dir: thirdParty/cupla
git-subtree-split: f60a0ac72cb175d4a8a67301882f7a6b1de4c3c3
Third Party authored and BrianMarre committed Apr 29, 2020
1 parent 0dea7e2 commit 94b25fb
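
One of the changes listed above, "Add float3 and int3 with make-functions" (84008fb6ae), brings CUDA-style vector types to cupla. As a rough, standalone illustration of that pattern only — the struct layout and function names below follow the CUDA convention and are not copied from cupla's actual header — a minimal sketch might look like this:

#include <cstdio>

// Hypothetical stand-ins for CUDA-style vector types; illustration only,
// not cupla's real definitions.
struct float3 { float x, y, z; };
struct int3   { int   x, y, z; };

// Make-functions in the style of CUDA's make_float3 / make_int3.
inline float3 make_float3(float x, float y, float z) { return float3{x, y, z}; }
inline int3   make_int3(int x, int y, int z)         { return int3{x, y, z}; }

int main()
{
    float3 const f = make_float3(1.0f, 2.0f, 3.0f);
    int3   const i = make_int3(4, 5, 6);
    std::printf("f = (%f, %f, %f), i = (%d, %d, %d)\n",
                f.x, f.y, f.z, i.x, i.y, i.z);
    return 0;
}
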
Showing 21 changed files with 1,920 additions and 1 deletion.
75 changes: 75 additions & 0 deletions .travis.yml
@@ -32,12 +32,26 @@ install:
#############################################################################
# PMacc CPU-only dependencies #
#############################################################################
<<<<<<< HEAD
- SPACK_FOUND=$(which spack >/dev/null && { echo 0; } || { echo 1; })
- if [ $SPACK_FOUND -ne 0 ]; then
mkdir -p $SPACK_ROOT &&
git clone --depth 50 https://github.com/spack/spack.git $SPACK_ROOT &&
echo -e "config:""\n build_jobs:"" 2" > $SPACK_ROOT/etc/spack/config.yaml &&
echo -e "packages:""\n cmake:""\n version:"" [3.11.4]""\n paths:""\n cmake@3.11.4:"" /home/travis/.cache/cmake-3.11.4""\n buildable:"" False" > $SPACK_ROOT/etc/spack/packages.yaml;
=======
- export PATH=$CMAKE_ROOT/bin:$PATH
- CMAKE_3_11_4_FOUND=$(cmake --version | grep " 3\.11\.4" >/dev/null && { echo 0; } || { echo 1; })
- if [ $CMAKE_3_11_4_FOUND -ne 0 ]; then
mkdir -p $CMAKE_ROOT &&
cd $CMAKE_ROOT &&
rm -rf $CMAKE_ROOT/* &&
travis_retry wget --no-check-certificate http://cmake.org/files/v3.11/cmake-3.11.4-Linux-x86_64.tar.gz &&
tar -xzf cmake-3.11.4-Linux-x86_64.tar.gz &&
mv cmake-3.11.4-Linux-x86_64/* . &&
rm -rf cmake-3.11.4-Linux-x86_64.tar.gz cmake-3.11.4-Linux-x86_64 &&
cd -;
>>>>>>> Squashed 'thirdParty/cupla/' changes from 0594a68a0d..f60a0ac72c
fi
- spack compiler add
# required dependencies - CMake 3.11.4
@@ -69,6 +83,7 @@ install:
- spack load cmake
- spack load boost $COMPILERSPEC

<<<<<<< HEAD
jobs:
fast_finish: true
include:
@@ -178,3 +193,63 @@ jobs:
- export CXX=clang++
- export CC=clang
- export FC=gfortran-4.9
=======
script:
#############################################################################
# Example: Matrix Multiplication (adapted original) #
#############################################################################
- cd $HOME/matrixMul
- cmake $TRAVIS_BUILD_DIR/example/CUDASamples/matrixMul/ $CMAKE_FLAGS
- make
# can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original
# SDK example
# CPU_B_SEQ_T_OMP2/THREADS: too many threads necessary (256)
# - ./matrixMul -wA=64 -wB=64 -hA=64 -hB=64
#############################################################################
# Example: Async API (adapted original) #
#############################################################################
- cd $HOME/asyncAPI
- cmake $TRAVIS_BUILD_DIR/example/CUDASamples/asyncAPI/ $CMAKE_FLAGS
- make
# can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original
# SDK example
# CPU_B_SEQ_T_OMP2/THREADS: too many threads necessary (512)
# - ./asyncAPI
#############################################################################
# Example: Async API (added elements layer) #
#############################################################################
- cd $HOME/asyncAPI_tuned
- cmake $TRAVIS_BUILD_DIR/example/CUDASamples/asyncAPI_tuned/ $CMAKE_FLAGS
- make
- if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] ||
[ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then
./asyncAPI_tuned;
fi
#############################################################################
# Example: vectorAdd (added elements layer) #
#############################################################################
- cd $HOME/vectorAdd
- cmake $TRAVIS_BUILD_DIR/example/CUDASamples/vectorAdd/ $CMAKE_FLAGS
- make
- if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] ||
[ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then
./vectorAdd 100000;
fi
#############################################################################
# Example: BlackScholes (adapted original) #
#############################################################################
- cd $HOME/blackScholes
- cmake $TRAVIS_BUILD_DIR/example/CUDASamples/blackScholes/ $CMAKE_FLAGS
- make
- if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] ||
[ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then
./blackScholes;
fi
#############################################################################
# Test: additional tests #
#############################################################################
- cd $HOME/test/config
- if [[ $CXX =~ "^g\+\+" ]] || [[ "$COMPILER" == "nvcc" ]] ; then
$TRAVIS_BUILD_DIR/test/system/config/test.sh $CXX;
fi
>>>>>>> Squashed 'thirdParty/cupla/' changes from 0594a68a0d..f60a0ac72c
2 changes: 1 addition & 1 deletion README.md
@@ -235,4 +235,4 @@ way!
********************************************************************************

![image of an lwfa](docs/images/lwfa_iso.png "LWFA")
![image of our strong scaling](docs/images/StrongScalingPIConGPU_log.png "Strong Scaling")
![image of our strong scaling](docs/images/StrongScalingPIConGPU_log.png "Strong Scaling")
213 changes: 213 additions & 0 deletions alpaka/example/vectorAdd/src/vectorAdd.cpp
@@ -0,0 +1,213 @@
/* Copyright 2019 Benjamin Worpitz, Matthias Werner
*
* This file exemplifies usage of Alpaka.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
* IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/

#include <alpaka/alpaka.hpp>

#include <random>
#include <iostream>
#include <typeinfo>

//#############################################################################
//! A vector addition kernel.
class VectorAddKernel
{
public:
//-----------------------------------------------------------------------------
//! The kernel entry point.
//!
//! \tparam TAcc The accelerator environment to be executed on.
//! \tparam TElem The matrix element type.
//! \param acc The accelerator to be executed on.
//! \param A The first source vector.
//! \param B The second source vector.
//! \param C The destination vector.
//! \param numElements The number of elements.
ALPAKA_NO_HOST_ACC_WARNING
template<
typename TAcc,
typename TElem,
typename TIdx>
ALPAKA_FN_ACC auto operator()(
TAcc const & acc,
TElem const * const A,
TElem const * const B,
TElem * const C,
TIdx const & numElements) const
-> void
{
static_assert(
alpaka::dim::Dim<TAcc>::value == 1,
"The VectorAddKernel expects 1-dimensional indices!");

TIdx const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
TIdx const threadElemExtent(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
TIdx const threadFirstElemIdx(gridThreadIdx * threadElemExtent);

if(threadFirstElemIdx < numElements)
{
// Calculate the number of elements to compute in this thread.
// The result is uniform for all but the last thread.
TIdx const threadLastElemIdx(threadFirstElemIdx+threadElemExtent);
TIdx const threadLastElemIdxClipped((numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);

for(TIdx i(threadFirstElemIdx); i<threadLastElemIdxClipped; ++i)
{
C[i] = A[i] + B[i];
}
}
}
};

auto main()
-> int
{
// Fallback for the CI with disabled sequential backend
#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
return EXIT_SUCCESS;
#else
// Define the index domain
using Dim = alpaka::dim::DimInt<1u>;
using Idx = std::size_t;

// Define the accelerator
//
// It is possible to choose from a set of accelerators
// that are defined in the alpaka::acc namespace e.g.:
// - AccGpuCudaRt
// - AccCpuThreads
// - AccCpuFibers
// - AccCpuOmp2Threads
// - AccCpuOmp2Blocks
// - AccCpuOmp4
// - AccCpuSerial
using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
using DevAcc = alpaka::dev::Dev<Acc>;
using PltfAcc = alpaka::pltf::Pltf<DevAcc>;

// Defines the synchronization behavior of a queue
//
// choose between Blocking and NonBlocking
using QueueProperty = alpaka::queue::Blocking;
using QueueAcc = alpaka::queue::Queue<Acc, QueueProperty>;

// Select a device
DevAcc const devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));

// Create a queue on the device
QueueAcc queue(devAcc);

// Define the work division
Idx const numElements(123456);
Idx const elementsPerThread(3u);
alpaka::vec::Vec<Dim, Idx> const extent(numElements);

// Let alpaka calculate good block and grid sizes given our full problem extent
alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
alpaka::workdiv::getValidWorkDiv<Acc>(
devAcc,
extent,
elementsPerThread,
false,
alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));

// Define the buffer element type
using Data = std::uint32_t;

// Get the host device for allocating memory on the host.
using DevHost = alpaka::dev::DevCpu;
using PltfHost = alpaka::pltf::Pltf<DevHost>;
DevHost const devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));

// Allocate 3 host memory buffers
using BufHost = alpaka::mem::buf::Buf<DevHost, Data, Dim, Idx>;
BufHost bufHostA(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
BufHost bufHostB(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
BufHost bufHostC(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));

// Initialize the host input vectors A and B
Data * const pBufHostA(alpaka::mem::view::getPtrNative(bufHostA));
Data * const pBufHostB(alpaka::mem::view::getPtrNative(bufHostB));
Data * const pBufHostC(alpaka::mem::view::getPtrNative(bufHostC));

// C++11 random generator for uniformly distributed numbers in {1,..,42}
std::random_device rd{};
std::default_random_engine eng{ rd() };
std::uniform_int_distribution<Data> dist(1, 42);

for (Idx i(0); i < numElements; ++i)
{
pBufHostA[i] = dist(eng);
pBufHostB[i] = dist(eng);
pBufHostC[i] = 0;
}

// Allocate 3 buffers on the accelerator
using BufAcc = alpaka::mem::buf::Buf<DevAcc, Data, Dim, Idx>;
BufAcc bufAccA(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
BufAcc bufAccB(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
BufAcc bufAccC(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));

// Copy Host -> Acc
alpaka::mem::view::copy(queue, bufAccA, bufHostA, extent);
alpaka::mem::view::copy(queue, bufAccB, bufHostB, extent);
alpaka::mem::view::copy(queue, bufAccC, bufHostC, extent);

// Instantiate the kernel function object
VectorAddKernel kernel;

// Create the kernel execution task.
auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
workDiv,
kernel,
alpaka::mem::view::getPtrNative(bufAccA),
alpaka::mem::view::getPtrNative(bufAccB),
alpaka::mem::view::getPtrNative(bufAccC),
numElements));

// Enqueue the kernel execution task
alpaka::queue::enqueue(queue, taskKernel);

// Copy back the result
alpaka::mem::view::copy(queue, bufHostC, bufAccC, extent);
alpaka::wait::wait(queue);

bool resultCorrect(true);
for(Idx i(0u);
i < numElements;
++i)
{
Data const & val(pBufHostC[i]);
Data const correctResult(pBufHostA[i] + pBufHostB[i]);
if(val != correctResult)
{
std::cerr << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
resultCorrect = false;
}
}

if(resultCorrect)
{
std::cout << "Execution results correct!" << std::endl;
return EXIT_SUCCESS;
}
else
{
std::cout << "Execution results incorrect!" << std::endl;
return EXIT_FAILURE;
}
#endif
}
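
The Travis comments earlier note that the unmodified SDK samples lack an "elements layer"; the VectorAddKernel above shows what that layer adds: each thread handles a contiguous, clipped chunk of elements instead of exactly one. The same chunking, stripped of the alpaka API and run sequentially on the host, is sketched below with hypothetical names chosen for illustration:

#include <cstddef>
#include <vector>

// Host-side illustration of the per-thread element chunking used by
// VectorAddKernel above: "thread" t handles the element range
// [t * elemsPerThread, min((t + 1) * elemsPerThread, numElements)).
void vectorAddChunked(std::vector<float> const& A, std::vector<float> const& B,
                      std::vector<float>& C, std::size_t elemsPerThread)
{
    std::size_t const numElements = A.size();
    std::size_t const numThreads =
        (numElements + elemsPerThread - 1) / elemsPerThread;   // ceiling division
    for (std::size_t t = 0; t < numThreads; ++t)               // one iteration per "thread"
    {
        std::size_t const first = t * elemsPerThread;
        std::size_t const last  = (first + elemsPerThread < numElements)
                                      ? first + elemsPerThread
                                      : numElements;           // clip the last chunk
        for (std::size_t i = first; i < last; ++i)
            C[i] = A[i] + B[i];
    }
}

int main()
{
    std::vector<float> A(10, 1.0f), B(10, 2.0f), C(10, 0.0f);
    vectorAddChunked(A, B, C, 3);   // 4 chunks: 3 + 3 + 3 + 1 elements
    return C[9] == 3.0f ? 0 : 1;
}
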