Skip to content

Commit

Permalink
support extents with arbitrary value types
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed May 11, 2022
1 parent 50bac5f commit da970d8
Show file tree
Hide file tree
Showing 67 changed files with 863 additions and 605 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ if (BUILD_TESTING)
target_compile_options(tests PRIVATE /permissive- /constexpr:steps10000000 /diagnostics:caret)
else()
target_compile_features(tests PRIVATE cxx_std_20)
target_compile_options(tests PRIVATE -Wall -Wextra -Wno-missing-braces)
target_compile_options(tests PRIVATE -Wall -Wextra -Werror=narrowing -Wno-missing-braces)
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
target_compile_options(tests PRIVATE -fconstexpr-steps=10000000)
Expand Down
2 changes: 1 addition & 1 deletion docs/pages/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Array dimensions

.. doxygenstruct:: llama::ArrayExtents
.. doxygentypedef:: llama::ArrayExtentsDynamic
.. doxygentypedef:: llama::ArrayExtentsStatic
.. doxygentypedef:: llama::ArrayExtentsNCube
.. doxygenstruct:: llama::ArrayIndex

.. doxygenstruct:: llama::ArrayIndexIterator
Expand Down
8 changes: 5 additions & 3 deletions examples/alpaka/asyncblur/asyncblur.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,10 @@ struct BlurKernel
{
// Using SoA for the shared memory
constexpr auto sharedChunkSize = ElemsPerBlock + 2 * KernelSize;
constexpr auto sharedMapping = llama::mapping::
SoA<llama::ArrayExtents<sharedChunkSize, sharedChunkSize>, typename View::RecordDim, false>{};
constexpr auto sharedMapping = llama::mapping::SoA<
llama::ArrayExtents<std::size_t, sharedChunkSize, sharedChunkSize>,
typename View::RecordDim,
false>{};
auto& sharedMem = alpaka::declareSharedVar<std::byte[sharedMapping.blobSize(0)], __COUNTER__>(acc);
return llama::View(sharedMapping, llama::Array<std::byte*, 1>{&sharedMem[0]});
}
Expand Down Expand Up @@ -211,7 +213,7 @@ try
const auto hostMapping
= llama::mapping::tree::Mapping{llama::ArrayExtents{buffer_y, buffer_x}, treeOperationList, Pixel{}};
const auto devMapping = llama::mapping::tree::Mapping{
llama::ArrayExtents<CHUNK_SIZE + 2 * KERNEL_SIZE, CHUNK_SIZE + 2 * KERNEL_SIZE>{},
llama::ArrayExtents<std::size_t, CHUNK_SIZE + 2 * KERNEL_SIZE, CHUNK_SIZE + 2 * KERNEL_SIZE>{},
treeOperationList,
PixelOnAcc{}};
using DevMapping = std::decay_t<decltype(devMapping)>;
Expand Down
28 changes: 14 additions & 14 deletions examples/alpaka/nbody/nbody.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ LLAMA_FN_HOST_ACC_INLINE auto store(FP& dst, Vec v)
v.store(&dst);
}

template<std::size_t Elems>
template<int Elems>
struct VecType
{
// TODO(bgruber): we need a vector type that also works on GPUs
Expand All @@ -116,7 +116,7 @@ struct VecType<1>
using type = FP;
};

template<std::size_t Elems, typename ViewParticleI, typename VirtualParticleJ>
template<int Elems, typename ViewParticleI, typename VirtualParticleJ>
LLAMA_FN_HOST_ACC_INLINE void pPInteraction(ViewParticleI pi, VirtualParticleJ pj)
{
using Vec = typename VecType<Elems>::type;
Expand All @@ -143,7 +143,7 @@ LLAMA_FN_HOST_ACC_INLINE void pPInteraction(ViewParticleI pi, VirtualParticleJ p
store<Vec>(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::Z{})));
}

template<std::size_t ProblemSize, std::size_t Elems, std::size_t BlockSize, Mapping MappingSM>
template<int ProblemSize, int Elems, int BlockSize, Mapping MappingSM>
struct UpdateKernel
{
template<typename Acc, typename View>
Expand All @@ -158,7 +158,7 @@ struct UpdateKernel
{
constexpr auto sharedMapping = []
{
using ArrayExtents = llama::ArrayExtents<BlockSize>;
using ArrayExtents = llama::ArrayExtents<int, BlockSize>;
if constexpr(MappingSM == AoS)
return llama::mapping::AoS<ArrayExtents, Particle>{};
if constexpr(MappingSM == SoA)
Expand All @@ -181,7 +181,7 @@ struct UpdateKernel
auto pi = [&]
{
constexpr auto mapping
= llama::mapping::SoA<llama::ArrayExtents<Elems>, typename View::RecordDim, false>{};
= llama::mapping::SoA<llama::ArrayExtents<int, Elems>, typename View::RecordDim, false>{};
return llama::allocViewUninitialized(mapping, llama::bloballoc::Stack<mapping.blobSize(0)>{});
}();
// TODO(bgruber): vector load
Expand All @@ -190,26 +190,26 @@ struct UpdateKernel
pi(e) = particles(ti * Elems + e);

LLAMA_INDEPENDENT_DATA
for(std::size_t blockOffset = 0; blockOffset < ProblemSize; blockOffset += BlockSize)
for(int blockOffset = 0; blockOffset < ProblemSize; blockOffset += BlockSize)
{
LLAMA_INDEPENDENT_DATA
for(auto j = tbi; j < BlockSize; j += THREADS_PER_BLOCK)
for(int j = tbi; j < BlockSize; j += THREADS_PER_BLOCK)
sharedView(j) = particles(blockOffset + j);
alpaka::syncBlockThreads(acc);

LLAMA_INDEPENDENT_DATA
for(auto j = std::size_t{0}; j < BlockSize; ++j)
for(int j = 0; j < BlockSize; ++j)
pPInteraction<Elems>(pi(0u), sharedView(j));
alpaka::syncBlockThreads(acc);
}
// TODO(bgruber): vector store
LLAMA_INDEPENDENT_DATA
for(auto e = 0u; e < Elems; e++)
for(int e = 0u; e < Elems; e++)
particles(ti * Elems + e) = pi(e);
}
};

template<std::size_t ProblemSize, std::size_t Elems>
template<int ProblemSize, int Elems>
struct MoveKernel
{
template<typename Acc, typename View>
Expand All @@ -235,7 +235,7 @@ template<template<typename, typename> typename AccTemplate, Mapping MappingGM, M
void run(std::ostream& plotFile)
{
using Dim = alpaka::DimInt<1>;
using Size = std::size_t;
using Size = int;
using Acc = AccTemplate<Dim, Size>;
using DevHost = alpaka::DevCpu;
using DevAcc = alpaka::Dev<Acc>;
Expand All @@ -262,7 +262,7 @@ void run(std::ostream& plotFile)

auto mapping = []
{
using ArrayExtents = llama::ArrayExtents<llama::dyn>;
using ArrayExtents = llama::ArrayExtentsDynamic<1, int>;
const auto extents = ArrayExtents{PROBLEM_SIZE};
if constexpr(MappingGM == AoS)
return llama::mapping::AoS<ArrayExtents, Particle>{extents};
Expand Down Expand Up @@ -290,7 +290,7 @@ void run(std::ostream& plotFile)

std::mt19937_64 generator;
std::normal_distribution<FP> distribution(FP(0), FP(1));
for(std::size_t i = 0; i < PROBLEM_SIZE; ++i)
for(int i = 0; i < PROBLEM_SIZE; ++i)
{
llama::One<Particle> p;
p(tag::Pos(), tag::X()) = distribution(generator);
Expand All @@ -315,7 +315,7 @@ void run(std::ostream& plotFile)

double sumUpdate = 0;
double sumMove = 0;
for(std::size_t s = 0; s < STEPS; ++s)
for(int s = 0; s < STEPS; ++s)
{
auto updateKernel = UpdateKernel<PROBLEM_SIZE, DESIRED_ELEMENTS_PER_THREAD, THREADS_PER_BLOCK, MappingSM>{};
alpaka::exec<Acc>(queue, workdiv, updateKernel, accView);
Expand Down
2 changes: 1 addition & 1 deletion examples/alpaka/pic/pic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ auto setup(Queue& queue, const Dev& dev, const DevHost& devHost)

auto particleMapping = [&]
{
using ArrayExtents = llama::ArrayExtents<llama::dyn>;
using ArrayExtents = llama::ArrayExtentsDynamic<1>;
const auto particleExtents = ArrayExtents{numpart};
if constexpr(ParticleMapping == 0)
return llama::mapping::AoS<ArrayExtents, Particle>{particleExtents};
Expand Down
2 changes: 1 addition & 1 deletion examples/alpaka/vectoradd/vectoradd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ try
// LLAMA
const auto mapping = [&]
{
using ArrayExtents = llama::ArrayExtents<llama::dyn>;
using ArrayExtents = llama::ArrayExtentsDynamic<1>;
const auto extents = ArrayExtents{PROBLEM_SIZE};
if constexpr(MAPPING == 0)
return llama::mapping::AoS<ArrayExtents, Vector>{extents};
Expand Down
2 changes: 1 addition & 1 deletion examples/bitpackfloat/bitpackfloat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ auto main() -> int
constexpr auto exponentBits = 5;
constexpr auto mantissaBits = 13;
const auto mapping
= llama::mapping::BitPackedFloatSoA{llama::ArrayExtents<llama::dyn>{N}, exponentBits, mantissaBits, Vector{}};
= llama::mapping::BitPackedFloatSoA{llama::ArrayExtents{N}, exponentBits, mantissaBits, Vector{}};

auto view = llama::allocView(mapping);

Expand Down
5 changes: 3 additions & 2 deletions examples/bitpackint/bitpackint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ auto main() -> int
constexpr auto N = 128;
constexpr auto bits = 7;
const auto mapping
= llama::mapping::BitPackedIntSoA<llama::ArrayExtentsDynamic<1>, Vector, llama::Constant<bits>>{{N}};
= llama::mapping::BitPackedIntSoA<llama::ArrayExtentsDynamic<std::size_t, 1>, Vector, llama::Constant<bits>>{
{N}};

auto view = llama::allocView(mapping);

Expand All @@ -49,7 +50,7 @@ auto main() -> int

// extract into a view of full size integers
auto viewExtracted
= llama::allocViewUninitialized(llama::mapping::AoS<llama::ArrayExtents<llama::dyn>, Vector>{{N}});
= llama::allocViewUninitialized(llama::mapping::AoS<llama::ArrayExtentsDynamic<std::size_t, 1>, Vector>{{N}});
llama::copy(view, viewExtracted);
if(!std::equal(view.begin(), view.end(), viewExtracted.begin(), viewExtracted.end()))
fmt::print("ERROR: unpacked view is different\n");
Expand Down
27 changes: 14 additions & 13 deletions examples/bufferguard/bufferguard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,16 @@ using Vector = llama::Record<
// clang-format on

template<template<typename, typename> typename InnerMapping, typename TRecordDim>
struct GuardMapping2D : llama::ArrayExtentsDynamic<2>
struct GuardMapping2D : llama::ArrayExtentsDynamic<std::size_t, 2>
{
using ArrayExtents = llama::ArrayExtentsDynamic<2>;
using ArrayIndex = llama::ArrayIndex<2>;
using ArrayExtents = llama::ArrayExtentsDynamic<std::size_t, 2>;
using ArrayIndex = llama::ArrayIndex<std::size_t, 2>;
using RecordDim = TRecordDim;

constexpr GuardMapping2D() = default;

constexpr explicit GuardMapping2D(ArrayExtents extents, RecordDim = {})
: llama::ArrayExtentsDynamic<2>(extents)
: llama::ArrayExtentsDynamic<std::size_t, 2>(extents)
, left({extents[0] - 2})
, right({extents[0] - 2})
, top({extents[1] - 2})
Expand Down Expand Up @@ -69,7 +69,7 @@ struct GuardMapping2D : llama::ArrayExtentsDynamic<2>

template<std::size_t... RecordCoords>
constexpr auto blobNrAndOffset(ArrayIndex ai, llama::RecordCoord<RecordCoords...> rc = {}) const
-> llama::NrAndOffset
-> llama::NrAndOffset<std::size_t>
{
// [0][0] is at left top
const auto [row, col] = ai;
Expand Down Expand Up @@ -144,7 +144,8 @@ struct GuardMapping2D : llama::ArrayExtentsDynamic<2>
}

private:
constexpr auto offsetBlobNr(llama::NrAndOffset nao, std::size_t blobNrOffset) const -> llama::NrAndOffset
constexpr auto offsetBlobNr(llama::NrAndOffset<std::size_t> nao, std::size_t blobNrOffset) const
-> llama::NrAndOffset<std::size_t>
{
nao.nr += blobNrOffset;
return nao;
Expand All @@ -162,11 +163,11 @@ struct GuardMapping2D : llama::ArrayExtentsDynamic<2>
llama::mapping::One<llama::ArrayExtents<>, RecordDim> leftBot;
llama::mapping::One<llama::ArrayExtents<>, RecordDim> rightTop;
llama::mapping::One<llama::ArrayExtents<>, RecordDim> rightBot;
InnerMapping<llama::ArrayExtentsDynamic<1>, RecordDim> left;
InnerMapping<llama::ArrayExtentsDynamic<1>, RecordDim> right;
InnerMapping<llama::ArrayExtentsDynamic<1>, RecordDim> top;
InnerMapping<llama::ArrayExtentsDynamic<1>, RecordDim> bot;
InnerMapping<llama::ArrayExtentsDynamic<2>, RecordDim> center;
InnerMapping<llama::ArrayExtentsDynamic<std::size_t, 1>, RecordDim> left;
InnerMapping<llama::ArrayExtentsDynamic<std::size_t, 1>, RecordDim> right;
InnerMapping<llama::ArrayExtentsDynamic<std::size_t, 1>, RecordDim> top;
InnerMapping<llama::ArrayExtentsDynamic<std::size_t, 1>, RecordDim> bot;
InnerMapping<llama::ArrayExtentsDynamic<std::size_t, 2>, RecordDim> center;

static constexpr auto leftTopOff = std::size_t{0};
static constexpr auto leftBotOff = leftTopOff + decltype(leftTop)::blobCount;
Expand Down Expand Up @@ -202,8 +203,8 @@ void run(const std::string& mappingName)
{
std::cout << "\n===== Mapping " << mappingName << " =====\n\n";

constexpr auto rows = 7;
constexpr auto cols = 5;
constexpr std::size_t rows = 7;
constexpr std::size_t cols = 5;
const auto extents = llama::ArrayExtents{rows, cols};
const auto mapping = GuardMapping2D<Mapping, Vector>{extents};
std::ofstream{"bufferguard_" + mappingName + ".svg"} << llama::toSvg(mapping);
Expand Down
2 changes: 1 addition & 1 deletion examples/bytesplit/bytesplit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ using Data = llama::Record<
auto main() -> int
{
constexpr auto N = 128;
using ArrayExtents = llama::ArrayExtentsDynamic<1>;
using ArrayExtents = llama::ArrayExtentsDynamic<std::size_t, 1>;
const auto mapping = llama::mapping::Bytesplit<ArrayExtents, Data, llama::mapping::BindSoA<false>::fn>{{N}};

auto view = llama::allocView(mapping);
Expand Down
20 changes: 10 additions & 10 deletions examples/cuda/nbody/nbody.cu
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ __global__ void updateSM(View particles)
{
constexpr auto sharedMapping = []
{
using ArrayExtents = llama::ArrayExtents<SHARED_ELEMENTS_PER_BLOCK>;
using ArrayExtents = llama::ArrayExtents<int, SHARED_ELEMENTS_PER_BLOCK>;
if constexpr(MappingSM == 0)
return llama::mapping::AoS<ArrayExtents, SharedMemoryParticle>{};
if constexpr(MappingSM == 1)
Expand All @@ -111,15 +111,15 @@ __global__ void updateSM(View particles)
const auto tbi = blockIdx.x;

llama::One<Particle> pi = particles(ti);
for(std::size_t blockOffset = 0; blockOffset < PROBLEM_SIZE; blockOffset += SHARED_ELEMENTS_PER_BLOCK)
for(int blockOffset = 0; blockOffset < PROBLEM_SIZE; blockOffset += SHARED_ELEMENTS_PER_BLOCK)
{
LLAMA_INDEPENDENT_DATA
for(auto j = tbi; j < SHARED_ELEMENTS_PER_BLOCK; j += THREADS_PER_BLOCK)
for(int j = tbi; j < SHARED_ELEMENTS_PER_BLOCK; j += THREADS_PER_BLOCK)
sharedView(j) = particles(blockOffset + j);
__syncthreads();

LLAMA_INDEPENDENT_DATA
for(auto j = std::size_t{0}; j < SHARED_ELEMENTS_PER_BLOCK; ++j)
for(int j = 0; j < SHARED_ELEMENTS_PER_BLOCK; ++j)
pPInteraction(pi, sharedView(j));
__syncthreads();
}
Expand All @@ -133,7 +133,7 @@ __global__ void update(View particles)

llama::One<Particle> pi = particles(ti);
LLAMA_INDEPENDENT_DATA
for(auto j = std::size_t{0}; j < PROBLEM_SIZE; ++j)
for(int j = 0; j < PROBLEM_SIZE; ++j)
pPInteraction(pi, particles(j));
particles(ti)(tag::Vel{}) = pi(tag::Vel{});
}
Expand Down Expand Up @@ -178,7 +178,7 @@ try

auto mapping = []
{
using ArrayExtents = llama::ArrayExtents<llama::dyn>;
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto extents = ArrayExtents{PROBLEM_SIZE};
if constexpr(Mapping == 0)
return llama::mapping::AoS<ArrayExtents, Particle>{extents};
Expand Down Expand Up @@ -224,7 +224,7 @@ try

std::default_random_engine engine;
std::normal_distribution<FP> distribution(FP(0), FP(1));
for(std::size_t i = 0; i < PROBLEM_SIZE; ++i)
for(int i = 0; i < PROBLEM_SIZE; ++i)
{
llama::One<Particle> p;
p(tag::Pos(), tag::X()) = distribution(engine);
Expand Down Expand Up @@ -267,7 +267,7 @@ try

double sumUpdate = 0;
double sumMove = 0;
for(std::size_t s = 0; s < STEPS; ++s)
for(int s = 0; s < STEPS; ++s)
{
if constexpr(RUN_UPATE)
{
Expand Down Expand Up @@ -388,7 +388,7 @@ namespace manual

std::default_random_engine engine;
std::normal_distribution<FP> distribution(FP(0), FP(1));
for(std::size_t i = 0; i < PROBLEM_SIZE; ++i)
for(int i = 0; i < PROBLEM_SIZE; ++i)
{
hostPositions[i].x = distribution(engine);
hostPositions[i].y = distribution(engine);
Expand Down Expand Up @@ -426,7 +426,7 @@ namespace manual

double sumUpdate = 0;
double sumMove = 0;
for(std::size_t s = 0; s < STEPS; ++s)
for(int s = 0; s < STEPS; ++s)
{
if constexpr(RUN_UPATE)
{
Expand Down
6 changes: 3 additions & 3 deletions examples/cuda/pitch/pitch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ namespace llamaex
template<std::size_t... RecordCoords>
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(
typename Base::ArrayIndex ai,
RecordCoord<RecordCoords...> = {}) const -> NrAndOffset
RecordCoord<RecordCoords...> = {}) const -> NrAndOffset<std::size_t>
{
constexpr std::size_t flatFieldIndex =
#ifdef __NVCC__
Expand All @@ -134,7 +134,7 @@ try
prop.totalGlobalMem / 1024 / 1024,
prop.sharedMemPerBlock / 1024);

const auto extents = llama::ArrayExtents{600, 800}; // height, width
const auto extents = llama::ArrayExtents<std::size_t, llama::dyn, llama::dyn>{600, 800}; // height, width
const auto widthBytes = extents[1] * sizeof(RGB);

const auto blockDim = dim3{16, 32, 1};
Expand All @@ -151,7 +151,7 @@ try
checkError(cudaMallocPitch(&mem, &rowPitch, widthBytes, extents[0]));
fmt::print("Row pitch: {} B ({} B padding)\n", rowPitch, rowPitch - widthBytes);

auto mapping = llamaex::PitchedAoS<llama::ArrayExtentsDynamic<2>, RGB>{extents, rowPitch};
auto mapping = llamaex::PitchedAoS<llama::ArrayExtentsDynamic<std::size_t, 2>, RGB>{extents, rowPitch};
assert(mapping.blobSize(0) == rowPitch * extents[0]);
auto view = llama::View{mapping, llama::Array{mem}};

Expand Down
Loading

0 comments on commit da970d8

Please sign in to comment.