forked from ComputationalRadiationPhysics/picongpu
Commit
Squashed 'thirdParty/cupla/' changes from 0594a68a0d..f60a0ac72c
f60a0ac72c Merge pull request ComputationalRadiationPhysics#155 from ComputationalRadiationPhysics/dev
4c73dde10d Merge pull request ComputationalRadiationPhysics#152 from psychocoderHPC/topic-changeLogVersionIncreaseTo0.2.0
fc428aa8e9 update release date for 0.2.0
1e32118c80 Merge pull request ComputationalRadiationPhysics#154 from sbastrakov/topic-addInt3Float3
84008fb6ae Add float3 and int3 with make-functions
4d5eace02a Merge pull request ComputationalRadiationPhysics#153 from sbastrakov/topic-finalizeBlackScholes2
bc35f6a822 Add Black-Scholes example
4b1ab5722f Merge pull request ComputationalRadiationPhysics#146 from psychocoderHPC/topic-changeLogVersionIncreaseTo0.2.0
52ddba6b26 changelog and version increase to 0.2.0
aabb8d5339 Merge pull request ComputationalRadiationPhysics#145 from psychocoderHPC/fix-minimalCMakeRequirements
7795e58c0c Merge pull request ComputationalRadiationPhysics#147 from psychocoderHPC/topic-updateAuthorSection
974afe8451 Merge pull request ComputationalRadiationPhysics#150 from sbastrakov/doc-clarifyThreadUnsafety
caf4fdda1e Extend the docs with thread unsafety
91d78da7eb Merge pull request ComputationalRadiationPhysics#149 from psychocoderHPC/topic-setTravisCmakeToMinimumRequiredVersion
a3bc920dfa use CMake 3.11.4 for travis tests
9ccab281e9 update author section
6bc13a220c Merge pull request ComputationalRadiationPhysics#148 from psychocoderHPC/topic-cuplaLogo
406fb5b93a update minimal CMake requirements to 3.11.4
9c32408da3 add cupla logo
cd6b9859d4 Merge pull request ComputationalRadiationPhysics#144 from psychocoderHPC/topic-hipPinnedMemory
c9fb7dd047 pin memory allocated with `cuplaMallocHoast`
92f8c82474 Merge pull request ComputationalRadiationPhysics#143 from psychocoderHPC/topic-updateAlpakaTo0.4.0
27f0bca235 Merge commit '0f175188a374a0194145728f2083256e1eb14b13' into topic-updateAlpakaTo0.4.0
0f175188a3 Squashed 'alpaka/' changes from d5e59590f..ab0b8a460
1e4aea9747 Merge pull request ComputationalRadiationPhysics#142 from sbastrakov/fix-typeCastWarnings
8f453a4bf5 Fix warnings concerning type casting
08f80ef24b Merge pull request ComputationalRadiationPhysics#141 from sbastrakov/topic-extendAtomicComments
b3d34b28ab Extend comments of atomic functions

git-subtree-dir: thirdParty/cupla
git-subtree-split: f60a0ac72cb175d4a8a67301882f7a6b1de4c3c3
1 parent 0dea7e2 · commit 94b25fb
Showing 21 changed files with 1,920 additions and 1 deletion.
@@ -0,0 +1,213 @@
/* Copyright 2019 Benjamin Worpitz, Matthias Werner
 *
 * This file exemplifies usage of Alpaka.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
 * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <alpaka/alpaka.hpp>

#include <cstdint>
#include <cstdlib>
#include <random>
#include <iostream>
#include <typeinfo>

//#############################################################################
//! A vector addition kernel.
class VectorAddKernel
{
public:
    //-----------------------------------------------------------------------------
    //! The kernel entry point.
    //!
    //! \tparam TAcc The accelerator environment to be executed on.
    //! \tparam TElem The vector element type.
    //! \param acc The accelerator to be executed on.
    //! \param A The first source vector.
    //! \param B The second source vector.
    //! \param C The destination vector.
    //! \param numElements The number of elements.
    ALPAKA_NO_HOST_ACC_WARNING
    template<
        typename TAcc,
        typename TElem,
        typename TIdx>
    ALPAKA_FN_ACC auto operator()(
        TAcc const & acc,
        TElem const * const A,
        TElem const * const B,
        TElem * const C,
        TIdx const & numElements) const
    -> void
    {
        static_assert(
            alpaka::dim::Dim<TAcc>::value == 1,
            "The VectorAddKernel expects 1-dimensional indices!");

        TIdx const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
        TIdx const threadElemExtent(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
        TIdx const threadFirstElemIdx(gridThreadIdx * threadElemExtent);

        if(threadFirstElemIdx < numElements)
        {
            // Calculate the number of elements to compute in this thread.
            // The result is uniform for all but the last thread.
            TIdx const threadLastElemIdx(threadFirstElemIdx + threadElemExtent);
            TIdx const threadLastElemIdxClipped((numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);

            for(TIdx i(threadFirstElemIdx); i < threadLastElemIdxClipped; ++i)
            {
                C[i] = A[i] + B[i];
            }
        }
    }
};
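
// [Editorial sketch, not part of the upstream file] A plain host-side
// reference of the element-wise addition performed above. Each accelerator
// thread of VectorAddKernel handles the chunk
// [threadFirstElemIdx, threadLastElemIdxClipped); taken over all threads this
// is equivalent to the single sequential loop below. The helper name
// vectorAddHostReference is hypothetical and only serves to illustrate the
// chunked indexing.
template<
    typename TElem,
    typename TIdx>
auto vectorAddHostReference(
    TElem const * const A,
    TElem const * const B,
    TElem * const C,
    TIdx const numElements)
-> void
{
    for(TIdx i(0); i < numElements; ++i)
    {
        C[i] = A[i] + B[i];
    }
}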

auto main()
-> int
{
    // Fallback for the CI with disabled sequential backend
#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
    return EXIT_SUCCESS;
#else
    // Define the index domain
    using Dim = alpaka::dim::DimInt<1u>;
    using Idx = std::size_t;

    // Define the accelerator
    //
    // It is possible to choose from a set of accelerators
    // that are defined in the alpaka::acc namespace e.g.:
    // - AccGpuCudaRt
    // - AccCpuThreads
    // - AccCpuFibers
    // - AccCpuOmp2Threads
    // - AccCpuOmp2Blocks
    // - AccCpuOmp4
    // - AccCpuSerial
    using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
    using DevAcc = alpaka::dev::Dev<Acc>;
    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
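
    // [Editorial sketch, assuming the corresponding backends were enabled
    // when alpaka was configured] Switching the accelerator only requires
    // changing the Acc alias, e.g. guarded by the backend's enable macro:
    //
    //     #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
    //         using Acc = alpaka::acc::AccGpuCudaRt<Dim, Idx>;
    //     #elif defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
    //         using Acc = alpaka::acc::AccCpuOmp2Blocks<Dim, Idx>;
    //     #endif
    //
    // DevAcc and PltfAcc are derived from Acc, so the rest of the program
    // stays unchanged.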

    // Defines the synchronization behavior of a queue
    //
    // choose between Blocking and NonBlocking
    using QueueProperty = alpaka::queue::Blocking;
    using QueueAcc = alpaka::queue::Queue<Acc, QueueProperty>;
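
    // [Editorial note] The alternative property would be
    //     using QueueProperty = alpaka::queue::NonBlocking;
    // in which case the enqueue calls below return immediately and the
    // explicit alpaka::wait::wait(queue) after the final copy becomes
    // mandatory before the host reads the results.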

    // Select a device
    DevAcc const devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));

    // Create a queue on the device
    QueueAcc queue(devAcc);

    // Define the work division
    Idx const numElements(123456);
    Idx const elementsPerThread(3u);
    alpaka::vec::Vec<Dim, Idx> const extent(numElements);

    // Let alpaka calculate good block and grid sizes given our full problem extent
    alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
        alpaka::workdiv::getValidWorkDiv<Acc>(
            devAcc,
            extent,
            elementsPerThread,
            false,
            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));
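
    // [Editorial note, parameter meanings as understood from alpaka 0.4]
    // The trailing `false` requests that the block thread extent need not
    // divide the grid thread extent evenly, and `Unrestricted` places no
    // additional constraint on how the grid is split into blocks. The chosen
    // division can typically be inspected, e.g.:
    //
    //     std::cout << "workDiv: " << workDiv << std::endl;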

    // Define the buffer element type
    using Data = std::uint32_t;

    // Get the host device for allocating memory on the host.
    using DevHost = alpaka::dev::DevCpu;
    using PltfHost = alpaka::pltf::Pltf<DevHost>;
    DevHost const devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));

    // Allocate 3 host memory buffers
    using BufHost = alpaka::mem::buf::Buf<DevHost, Data, Dim, Idx>;
    BufHost bufHostA(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
    BufHost bufHostB(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
    BufHost bufHostC(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));

    // Initialize the host input vectors A and B
    Data * const pBufHostA(alpaka::mem::view::getPtrNative(bufHostA));
    Data * const pBufHostB(alpaka::mem::view::getPtrNative(bufHostB));
    Data * const pBufHostC(alpaka::mem::view::getPtrNative(bufHostC));

    // C++11 random generator for uniformly distributed numbers in {1,..,42}
    std::random_device rd{};
    std::default_random_engine eng{ rd() };
    std::uniform_int_distribution<Data> dist(1, 42);

    for (Idx i(0); i < numElements; ++i)
    {
        pBufHostA[i] = dist(eng);
        pBufHostB[i] = dist(eng);
        pBufHostC[i] = 0;
    }

    // Allocate 3 buffers on the accelerator
    using BufAcc = alpaka::mem::buf::Buf<DevAcc, Data, Dim, Idx>;
    BufAcc bufAccA(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
    BufAcc bufAccB(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
    BufAcc bufAccC(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));

    // Copy Host -> Acc
    alpaka::mem::view::copy(queue, bufAccA, bufHostA, extent);
    alpaka::mem::view::copy(queue, bufAccB, bufHostB, extent);
    alpaka::mem::view::copy(queue, bufAccC, bufHostC, extent);

    // Instantiate the kernel function object
    VectorAddKernel kernel;

    // Create the kernel execution task.
    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
        workDiv,
        kernel,
        alpaka::mem::view::getPtrNative(bufAccA),
        alpaka::mem::view::getPtrNative(bufAccB),
        alpaka::mem::view::getPtrNative(bufAccC),
        numElements));
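
    // [Editorial note] createTaskKernel only builds a task object binding the
    // work division, the kernel functor, and its arguments; nothing runs on
    // the device until the task is enqueued below.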

    // Enqueue the kernel execution task
    alpaka::queue::enqueue(queue, taskKernel);

    // Copy back the result
    alpaka::mem::view::copy(queue, bufHostC, bufAccC, extent);
    alpaka::wait::wait(queue);
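
    // [Editorial note] wait() blocks the calling host thread until all work
    // enqueued so far (the copies and the kernel) has completed, so
    // pBufHostC can be read safely below. An alternative check could reuse
    // the hypothetical vectorAddHostReference sketch above, assuming
    // <vector> is also included:
    //
    //     std::vector<Data> expected(numElements);
    //     vectorAddHostReference(pBufHostA, pBufHostB, expected.data(), numElements);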

    bool resultCorrect(true);
    for(Idx i(0u); i < numElements; ++i)
    {
        Data const & val(pBufHostC[i]);
        Data const correctResult(pBufHostA[i] + pBufHostB[i]);
        if(val != correctResult)
        {
            std::cerr << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
            resultCorrect = false;
        }
    }

    if(resultCorrect)
    {
        std::cout << "Execution results correct!" << std::endl;
        return EXIT_SUCCESS;
    }
    else
    {
        std::cout << "Execution results incorrect!" << std::endl;
        return EXIT_FAILURE;
    }
#endif
}