Squashed 'thirdParty/cupla/' changes from 0594a68a0d..f60a0ac72c
f60a0ac72c Merge pull request ComputationalRadiationPhysics#155 from ComputationalRadiationPhysics/dev
4c73dde10d Merge pull request ComputationalRadiationPhysics#152 from psychocoderHPC/topic-changeLogVersionIncreaseTo0.2.0
fc428aa8e9 update release date for 0.2.0
1e32118c80 Merge pull request ComputationalRadiationPhysics#154 from sbastrakov/topic-addInt3Float3
84008fb6ae Add float3 and int3 with make-functions
4d5eace02a Merge pull request ComputationalRadiationPhysics#153 from sbastrakov/topic-finalizeBlackScholes2
bc35f6a822 Add Black-Scholes example
4b1ab5722f Merge pull request ComputationalRadiationPhysics#146 from psychocoderHPC/topic-changeLogVersionIncreaseTo0.2.0
52ddba6b26 changelog and version increase to 0.2.0
aabb8d5339 Merge pull request ComputationalRadiationPhysics#145 from psychocoderHPC/fix-minimalCMakeRequirements
7795e58c0c Merge pull request ComputationalRadiationPhysics#147 from psychocoderHPC/topic-updateAuthorSection
974afe8451 Merge pull request ComputationalRadiationPhysics#150 from sbastrakov/doc-clarifyThreadUnsafety
caf4fdda1e Extend the docs with thread unsafety
91d78da7eb Merge pull request ComputationalRadiationPhysics#149 from psychocoderHPC/topic-setTravisCmakeToMinimumRequiredVersion
a3bc920dfa use CMake 3.11.4 for travis tests
9ccab281e9 update author section
6bc13a220c Merge pull request ComputationalRadiationPhysics#148 from psychocoderHPC/topic-cuplaLogo
406fb5b93a update minimal CMake requirements to 3.11.4
9c32408da3 add cupla logo
cd6b9859d4 Merge pull request ComputationalRadiationPhysics#144 from psychocoderHPC/topic-hipPinnedMemory
c9fb7dd047 pin memory allocated with `cuplaMallocHoast`
92f8c82474 Merge pull request ComputationalRadiationPhysics#143 from psychocoderHPC/topic-updateAlpakaTo0.4.0
27f0bca235 Merge commit '0f175188a374a0194145728f2083256e1eb14b13' into topic-updateAlpakaTo0.4.0
0f175188a3 Squashed 'alpaka/' changes from d5e59590f..ab0b8a460
1e4aea9747 Merge pull request ComputationalRadiationPhysics#142 from sbastrakov/fix-typeCastWarnings
8f453a4bf5 Fix warnings concerning type casting
08f80ef24b Merge pull request ComputationalRadiationPhysics#141 from sbastrakov/topic-extendAtomicComments
b3d34b28ab Extend comments of atomic functions

git-subtree-dir: thirdParty/cupla
git-subtree-split: f60a0ac72cb175d4a8a67301882f7a6b1de4c3c3
Third Party authored and BrianMarre committed Apr 29, 2020
1 parent 0dea7e2 commit 94b25fb
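
One of the changes listed above, "Add float3 and int3 with make-functions" (84008fb6ae), brings CUDA-style vector types to cupla. As a rough, standalone illustration of that pattern only — the struct layout and function names below follow the CUDA convention and are not copied from cupla's actual header — a minimal sketch might look like this:

#include <cstdio>

// Hypothetical stand-ins for CUDA-style vector types; illustration only,
// not cupla's real definitions.
struct float3 { float x, y, z; };
struct int3   { int   x, y, z; };

// Make-functions in the style of CUDA's make_float3 / make_int3.
inline float3 make_float3(float x, float y, float z) { return float3{x, y, z}; }
inline int3   make_int3(int x, int y, int z)         { return int3{x, y, z}; }

int main()
{
    float3 const f = make_float3(1.0f, 2.0f, 3.0f);
    int3   const i = make_int3(4, 5, 6);
    std::printf("f = (%f, %f, %f), i = (%d, %d, %d)\n",
                f.x, f.y, f.z, i.x, i.y, i.z);
    return 0;
}
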
Showing 21 changed files with 1,920 additions and 1 deletion.
75 changes: 75 additions & 0 deletions .travis.yml
@@ -32,12 +32,26 @@ install:
#############################################################################
# PMacc CPU-only dependencies #
#############################################################################
<<<<<<< HEAD
- SPACK_FOUND=$(which spack >/dev/null && { echo 0; } || { echo 1; })
- if [ $SPACK_FOUND -ne 0 ]; then
mkdir -p $SPACK_ROOT &&
git clone --depth 50 https://github.com/spack/spack.git $SPACK_ROOT &&
echo -e "config:""\n build_jobs:"" 2" > $SPACK_ROOT/etc/spack/config.yaml &&
echo -e "packages:""\n cmake:""\n version:"" [3.11.4]""\n paths:""\n cmake@3.11.4:"" /home/travis/.cache/cmake-3.11.4""\n buildable:"" False" > $SPACK_ROOT/etc/spack/packages.yaml;
=======
- export PATH=$CMAKE_ROOT/bin:$PATH
- CMAKE_3_11_4_FOUND=$(cmake --version | grep " 3\.11\.4" >/dev/null && { echo 0; } || { echo 1; })
- if [ $CMAKE_3_11_4_FOUND -ne 0 ]; then
mkdir -p $CMAKE_ROOT &&
cd $CMAKE_ROOT &&
rm -rf $CMAKE_ROOT/* &&
travis_retry wget --no-check-certificate http://cmake.org/files/v3.11/cmake-3.11.4-Linux-x86_64.tar.gz &&
tar -xzf cmake-3.11.4-Linux-x86_64.tar.gz &&
mv cmake-3.11.4-Linux-x86_64/* . &&
rm -rf cmake-3.11.4-Linux-x86_64.tar.gz cmake-3.11.4-Linux-x86_64 &&
cd -;
>>>>>>> Squashed 'thirdParty/cupla/' changes from 0594a68a0d..f60a0ac72c
fi
- spack compiler add
# required dependencies - CMake 3.11.4
@@ -69,6 +83,7 @@ install:
- spack load cmake
- spack load boost $COMPILERSPEC

<<<<<<< HEAD
jobs:
fast_finish: true
include:
@@ -178,3 +193,63 @@ jobs:
- export CXX=clang++
- export CC=clang
- export FC=gfortran-4.9
=======
script:
#############################################################################
# Example: Matrix Multiplication (adapted original) #
#############################################################################
- cd $HOME/matrixMul
- cmake $TRAVIS_BUILD_DIR/example/CUDASamples/matrixMul/ $CMAKE_FLAGS
- make
# can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original
# SDK example
# CPU_B_SEQ_T_OMP2/THREADS: too many threads necessary (256)
# - ./matrixMul -wA=64 -wB=64 -hA=64 -hB=64
#############################################################################
# Example: Async API (adapted original) #
#############################################################################
- cd $HOME/asyncAPI
- cmake $TRAVIS_BUILD_DIR/example/CUDASamples/asyncAPI/ $CMAKE_FLAGS
- make
# can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original
# SDK example
# CPU_B_SEQ_T_OMP2/THREADS: too many threads necessary (512)
# - ./asyncAPI
#############################################################################
# Example: Async API (added elements layer) #
#############################################################################
- cd $HOME/asyncAPI_tuned
- cmake $TRAVIS_BUILD_DIR/example/CUDASamples/asyncAPI_tuned/ $CMAKE_FLAGS
- make
- if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] ||
[ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then
./asyncAPI_tuned;
fi
#############################################################################
# Example: vectorAdd (added elements layer) #
#############################################################################
- cd $HOME/vectorAdd
- cmake $TRAVIS_BUILD_DIR/example/CUDASamples/vectorAdd/ $CMAKE_FLAGS
- make
- if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] ||
[ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then
./vectorAdd 100000;
fi
#############################################################################
# Example: BlackScholes (adapted original) #
#############################################################################
- cd $HOME/blackScholes
- cmake $TRAVIS_BUILD_DIR/example/CUDASamples/blackScholes/ $CMAKE_FLAGS
- make
- if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] ||
[ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then
./blackScholes;
fi
#############################################################################
# Test: additional tests #
#############################################################################
- cd $HOME/test/config
- if [[ $CXX =~ "^g\+\+" ]] || [[ "$COMPILER" == "nvcc" ]] ; then
$TRAVIS_BUILD_DIR/test/system/config/test.sh $CXX;
fi
>>>>>>> Squashed 'thirdParty/cupla/' changes from 0594a68a0d..f60a0ac72c
2 changes: 1 addition & 1 deletion README.md
@@ -235,4 +235,4 @@ way!
********************************************************************************

![image of an lwfa](docs/images/lwfa_iso.png "LWFA")
![image of our strong scaling](docs/images/StrongScalingPIConGPU_log.png "Strong Scaling")
![image of our strong scaling](docs/images/StrongScalingPIConGPU_log.png "Strong Scaling")
213 changes: 213 additions & 0 deletions alpaka/example/vectorAdd/src/vectorAdd.cpp
@@ -0,0 +1,213 @@
/* Copyright 2019 Benjamin Worpitz, Matthias Werner
*
* This file exemplifies usage of Alpaka.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
* IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/

#include <alpaka/alpaka.hpp>

#include <random>
#include <iostream>
#include <typeinfo>

//#############################################################################
//! A vector addition kernel.
class VectorAddKernel
{
public:
//-----------------------------------------------------------------------------
//! The kernel entry point.
//!
//! \tparam TAcc The accelerator environment to be executed on.
//! \tparam TElem The matrix element type.
//! \param acc The accelerator to be executed on.
//! \param A The first source vector.
//! \param B The second source vector.
//! \param C The destination vector.
//! \param numElements The number of elements.
ALPAKA_NO_HOST_ACC_WARNING
template<
typename TAcc,
typename TElem,
typename TIdx>
ALPAKA_FN_ACC auto operator()(
TAcc const & acc,
TElem const * const A,
TElem const * const B,
TElem * const C,
TIdx const & numElements) const
-> void
{
static_assert(
alpaka::dim::Dim<TAcc>::value == 1,
"The VectorAddKernel expects 1-dimensional indices!");

TIdx const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
TIdx const threadElemExtent(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
TIdx const threadFirstElemIdx(gridThreadIdx * threadElemExtent);

if(threadFirstElemIdx < numElements)
{
// Calculate the number of elements to compute in this thread.
// The result is uniform for all but the last thread.
TIdx const threadLastElemIdx(threadFirstElemIdx+threadElemExtent);
TIdx const threadLastElemIdxClipped((numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);

for(TIdx i(threadFirstElemIdx); i<threadLastElemIdxClipped; ++i)
{
C[i] = A[i] + B[i];
}
}
}
};

auto main()
-> int
{
// Fallback for the CI with disabled sequential backend
#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
return EXIT_SUCCESS;
#else
// Define the index domain
using Dim = alpaka::dim::DimInt<1u>;
using Idx = std::size_t;

// Define the accelerator
//
// It is possible to choose from a set of accelerators
// that are defined in the alpaka::acc namespace e.g.:
// - AccGpuCudaRt
// - AccCpuThreads
// - AccCpuFibers
// - AccCpuOmp2Threads
// - AccCpuOmp2Blocks
// - AccCpuOmp4
// - AccCpuSerial
using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
using DevAcc = alpaka::dev::Dev<Acc>;
using PltfAcc = alpaka::pltf::Pltf<DevAcc>;

// Defines the synchronization behavior of a queue
//
// choose between Blocking and NonBlocking
using QueueProperty = alpaka::queue::Blocking;
using QueueAcc = alpaka::queue::Queue<Acc, QueueProperty>;

// Select a device
DevAcc const devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));

// Create a queue on the device
QueueAcc queue(devAcc);

// Define the work division
Idx const numElements(123456);
Idx const elementsPerThread(3u);
alpaka::vec::Vec<Dim, Idx> const extent(numElements);

// Let alpaka calculate good block and grid sizes given our full problem extent
alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
alpaka::workdiv::getValidWorkDiv<Acc>(
devAcc,
extent,
elementsPerThread,
false,
alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));

// Define the buffer element type
using Data = std::uint32_t;

// Get the host device for allocating memory on the host.
using DevHost = alpaka::dev::DevCpu;
using PltfHost = alpaka::pltf::Pltf<DevHost>;
DevHost const devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));

// Allocate 3 host memory buffers
using BufHost = alpaka::mem::buf::Buf<DevHost, Data, Dim, Idx>;
BufHost bufHostA(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
BufHost bufHostB(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
BufHost bufHostC(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));

// Initialize the host input vectors A and B
Data * const pBufHostA(alpaka::mem::view::getPtrNative(bufHostA));
Data * const pBufHostB(alpaka::mem::view::getPtrNative(bufHostB));
Data * const pBufHostC(alpaka::mem::view::getPtrNative(bufHostC));

// C++11 random generator for uniformly distributed numbers in {1,..,42}
std::random_device rd{};
std::default_random_engine eng{ rd() };
std::uniform_int_distribution<Data> dist(1, 42);

for (Idx i(0); i < numElements; ++i)
{
pBufHostA[i] = dist(eng);
pBufHostB[i] = dist(eng);
pBufHostC[i] = 0;
}

// Allocate 3 buffers on the accelerator
using BufAcc = alpaka::mem::buf::Buf<DevAcc, Data, Dim, Idx>;
BufAcc bufAccA(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
BufAcc bufAccB(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
BufAcc bufAccC(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));

// Copy Host -> Acc
alpaka::mem::view::copy(queue, bufAccA, bufHostA, extent);
alpaka::mem::view::copy(queue, bufAccB, bufHostB, extent);
alpaka::mem::view::copy(queue, bufAccC, bufHostC, extent);

// Instantiate the kernel function object
VectorAddKernel kernel;

// Create the kernel execution task.
auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
workDiv,
kernel,
alpaka::mem::view::getPtrNative(bufAccA),
alpaka::mem::view::getPtrNative(bufAccB),
alpaka::mem::view::getPtrNative(bufAccC),
numElements));

// Enqueue the kernel execution task
alpaka::queue::enqueue(queue, taskKernel);

// Copy back the result
alpaka::mem::view::copy(queue, bufHostC, bufAccC, extent);
alpaka::wait::wait(queue);

bool resultCorrect(true);
for(Idx i(0u);
i < numElements;
++i)
{
Data const & val(pBufHostC[i]);
Data const correctResult(pBufHostA[i] + pBufHostB[i]);
if(val != correctResult)
{
std::cerr << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
resultCorrect = false;
}
}

if(resultCorrect)
{
std::cout << "Execution results correct!" << std::endl;
return EXIT_SUCCESS;
}
else
{
std::cout << "Execution results incorrect!" << std::endl;
return EXIT_FAILURE;
}
#endif
}
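
The Travis comments earlier note that the unmodified SDK samples lack an "elements layer"; the VectorAddKernel above shows what that layer adds: each thread handles a contiguous, clipped chunk of elements instead of exactly one. The same chunking, stripped of the alpaka API and run sequentially on the host, is sketched below with hypothetical names chosen for illustration:

#include <cstddef>
#include <vector>

// Host-side illustration of the per-thread element chunking used by
// VectorAddKernel above: "thread" t handles the element range
// [t * elemsPerThread, min((t + 1) * elemsPerThread, numElements)).
void vectorAddChunked(std::vector<float> const& A, std::vector<float> const& B,
                      std::vector<float>& C, std::size_t elemsPerThread)
{
    std::size_t const numElements = A.size();
    std::size_t const numThreads =
        (numElements + elemsPerThread - 1) / elemsPerThread;   // ceiling division
    for (std::size_t t = 0; t < numThreads; ++t)               // one iteration per "thread"
    {
        std::size_t const first = t * elemsPerThread;
        std::size_t const last  = (first + elemsPerThread < numElements)
                                      ? first + elemsPerThread
                                      : numElements;           // clip the last chunk
        for (std::size_t i = first; i < last; ++i)
            C[i] = A[i] + B[i];
    }
}

int main()
{
    std::vector<float> A(10, 1.0f), B(10, 2.0f), C(10, 0.0f);
    vectorAddChunked(A, B, C, 3);   // 4 chunks: 3 + 3 + 3 + 1 elements
    return C[9] == 3.0f ? 0 : 1;
}
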