From 931ce77f7d6519a13e85379f8a510e59adf38ae3 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Thu, 3 Jun 2021 18:31:49 +0200
Subject: [PATCH] lift copy implementations from example into llama

Move the copy strategies from the viewcopy example into llama.
Add a new blobMemcpy strategy.
Add documentation.
Add unit tests.
Fix always defaulting to std::memcpy for parallel memcpy.
---
 docs/pages/api.rst              |   9 +
 docs/pages/copying.rst          |  43 +++-
 examples/viewcopy/viewcopy.cpp  | 317 +++------------------------
 include/llama/Copy.hpp          | 364 ++++++++++++++++++++++++++++++++
 include/llama/llama.hpp         |   1 +
 include/llama/mapping/AoS.hpp   |   5 +-
 include/llama/mapping/AoSoA.hpp |   5 +-
 include/llama/mapping/SoA.hpp   |   3 +-
 tests/copy.cpp                  | 113 ++++++++++
 9 files changed, 562 insertions(+), 298 deletions(-)
 create mode 100644 include/llama/Copy.hpp
 create mode 100644 tests/copy.cpp

diff --git a/docs/pages/api.rst b/docs/pages/api.rst
index c58e6ab35f..0780366db2 100644
--- a/docs/pages/api.rst
+++ b/docs/pages/api.rst
@@ -157,6 +157,15 @@ Data access
 .. doxygenstruct:: llama::VirtualRecord
    :members:
 
+Copying
+-------
+
+.. doxygenfunction:: llama::copy
+.. doxygenstruct:: llama::Copy
+   :members:
+.. doxygenfunction:: llama::fieldWiseCopy
+.. doxygenfunction:: llama::aosoaCommonBlockCopy
+
 Macros
 ------
 
diff --git a/docs/pages/copying.rst b/docs/pages/copying.rst
index 57aaa5ffce..24a99b3809 100644
--- a/docs/pages/copying.rst
+++ b/docs/pages/copying.rst
@@ -16,10 +16,6 @@ E.g. when both mappings use SoA, but one time with, one time without padding, or
 Or two AoSoA mappings with a different inner array length.
 In those cases an optimized copy procedure is possible, copying larger chunks than mere fields.
 
-.. For the moment, LLAMA implements a generic, field-wise copy with specializations for combinations of SoA and AoSoA mappings, reflect the properties of these.
-.. This is sub-optimal, because for every new mapping new specializations are needed.
-
-.. One thus needs new approaches on how to improve copying because LLAMA can provide the necessary infrastructure:
 Four solutions exist for this problem:
 
 1. Implement specializations for specific combinations of mappings, which reflect the properties of these.
@@ -42,3 +38,42 @@ A good approach could use smaller intermediate views to shuffle a chunk from one view to the other.
 The `async copy example `_ tries to show an asynchronous copy/shuffle/compute workflow.
 This example applies a blurring kernel to an RGB image, but may also work on only one or two channels instead of all three.
 Unused channels are not allocated and especially not copied.
+
+
+For the moment, LLAMA implements a generic, field-wise copy with specializations for combinations of SoA and AoSoA mappings, reflecting the properties of these.
+
+.. code-block:: C++
+
+    auto srcView = llama::allocView(srcMapping);
+    auto dstView = llama::allocView(dstMapping);
+    llama::copy(srcView, dstView); // use best copy strategy
+
+Internally, :cpp:`llama::copy` will choose a copy strategy depending on the source and destination mapping.
+This choice is done via template specializations of the :cpp:`llama::Copy` class template.
+Users can add specializations of :cpp:`llama::Copy` to provide additional copy strategies:
+
+.. code-block:: C++
+
+    // provide special copy from AoS -> UserDefinedMapping
+    template <typename ArrayDims, typename RecordDim, bool AlignAndPad, typename LinearizeArrayDims>
+    struct Copy<
+        llama::mapping::AoS<ArrayDims, RecordDim, AlignAndPad, LinearizeArrayDims>,
+        UserDefinedMapping<ArrayDims, RecordDim>>
+    {
+        template <typename SrcBlob, typename DstBlob>
+        void operator()(
+            const View<llama::mapping::AoS<ArrayDims, RecordDim, AlignAndPad, LinearizeArrayDims>, SrcBlob>& srcView,
+            View<UserDefinedMapping<ArrayDims, RecordDim>, DstBlob>& dstView,
+            std::size_t threadId, std::size_t threadCount) {
+            ...
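+            // hypothetical body (the patch intentionally elides it): e.g. shuffle the data
+            // field by field, or fall back to the generic helper:
+            // fieldWiseCopy(srcView, dstView, threadId, threadCount);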
+ } + }; + + llama::copy(srcView, dstView); // can delegate to above specialization now + +LLAMA also allows direct access to its two copy implementations, which is mainly used for benchmarking them: + +.. code-block:: C++ + + llama::fieldWiseCopy(srcView, dstView); // explicit field-wise copy + llama::aosoaCommonBlockCopy(srcView, dstView); // explicit SoA/AoSoA copy \ No newline at end of file diff --git a/examples/viewcopy/viewcopy.cpp b/examples/viewcopy/viewcopy.cpp index b36b304edf..a688a64a1c 100644 --- a/examples/viewcopy/viewcopy.cpp +++ b/examples/viewcopy/viewcopy.cpp @@ -46,45 +46,6 @@ using Particle = llama::Record< using RecordDim = boost::mp11::mp_take_c; // using RecordDim = Event; // WARN: expect long compilation time -namespace llamaex -{ - using namespace llama; - - template - void parallelForEachADCoord(ArrayDims adSize, std::size_t numThreads, Func&& func) - { -#pragma omp parallel for num_threads(numThreads) - for (std::ptrdiff_t i = 0; i < static_cast(adSize[0]); i++) - { - if constexpr (Dim > 1) - forEachADCoord(internal::popFront(adSize), std::forward(func), static_cast(i)); - else - std::forward(func)(ArrayDims{static_cast(i)}); - } - } -} // namespace llamaex - -template -void naive_copy( - const llama::View& srcView, - llama::View& dstView, - std::size_t numThreads = 1) -{ - static_assert(std::is_same_v); - - if (srcView.mapping.arrayDims() != dstView.mapping.arrayDims()) - throw std::runtime_error{"Array dimensions sizes are different"}; - - llamaex::parallelForEachADCoord( - srcView.mapping.arrayDims(), - numThreads, - [&](auto ad) LLAMA_LAMBDA_INLINE - { - llama::forEachLeaf([&](auto coord) LLAMA_LAMBDA_INLINE - { dstView(ad)(coord) = srcView(ad)(coord); }); - }); -} - template void std_copy(const llama::View& srcView, llama::View& dstView) { @@ -140,248 +101,6 @@ auto memcpy_avx2(void* dst, const void* src, size_t n) noexcept -> void* } #endif -inline void parallel_memcpy( - std::byte* dst, - const std::byte* src, - std::size_t size, - decltype(std::memcpy) = std::memcpy, - std::size_t numThreads = 1) -{ - const auto sizePerThread = size / numThreads; - const auto sizeLastThread = sizePerThread + size % numThreads; - -#pragma omp parallel num_threads(numThreads) - { - const auto id = static_cast(omp_get_thread_num()); - const auto sizeThisThread = id == numThreads - 1 ? 
sizeLastThread : sizePerThread; - std::memcpy(dst + id * sizePerThread, src + id * sizePerThread, sizeThisThread); - } -} - -template < - bool ReadOpt, - typename ArrayDims, - typename RecordDim, - std::size_t LanesSrc, - std::size_t LanesDst, - bool MBSrc, - bool MBDst, - typename SrcView, - typename DstView> -void aosoa_copy_internal(const SrcView& srcView, DstView& dstView, std::size_t numThreads) -{ - if (srcView.mapping.arrayDims() != dstView.mapping.arrayDims()) - throw std::runtime_error{"Array dimensions sizes are different"}; - - constexpr auto srcIsAoSoA = LanesSrc != std::numeric_limits::max(); - constexpr auto dstIsAoSoA = LanesDst != std::numeric_limits::max(); - - static_assert(!srcIsAoSoA || decltype(srcView.storageBlobs)::rank == 1); - static_assert(!dstIsAoSoA || decltype(dstView.storageBlobs)::rank == 1); - - const auto arrayDims = dstView.mapping.arrayDims(); - const auto flatSize = std::reduce(std::begin(arrayDims), std::end(arrayDims), std::size_t{1}, std::multiplies<>{}); - - // the same as AoSoA::blobNrAndOffset but takes a flat array index - auto mapAoSoA = [](std::size_t flatArrayIndex, auto coord, std::size_t Lanes) LLAMA_LAMBDA_INLINE - { - const auto blockIndex = flatArrayIndex / Lanes; - const auto laneIndex = flatArrayIndex % Lanes; - const auto offset = (llama::sizeOf * Lanes) * blockIndex - + llama::offsetOf * Lanes - + sizeof(llama::GetType) * laneIndex; - return offset; - }; - // the same as SoA::blobNrAndOffset but takes a flat array index - auto mapSoA = [&](std::size_t flatArrayIndex, auto coord, bool mb) LLAMA_LAMBDA_INLINE - { - const auto blob = mb * llama::flatRecordCoord; - const auto offset = !mb * llama::offsetOf * flatSize - + sizeof(llama::GetType) * flatArrayIndex; - return llama::NrAndOffset{blob, offset}; - }; - - auto mapSrc = [&srcView, &mapAoSoA, &mapSoA](std::size_t flatArrayIndex, auto coord) LLAMA_LAMBDA_INLINE - { - if constexpr (srcIsAoSoA) - return &srcView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, coord, LanesSrc); - else - { - const auto [blob, off] = mapSoA(flatArrayIndex, coord, MBSrc); - return &srcView.storageBlobs[blob][off]; - } - }; - auto mapDst = [&dstView, &mapAoSoA, &mapSoA](std::size_t flatArrayIndex, auto coord) LLAMA_LAMBDA_INLINE - { - if constexpr (dstIsAoSoA) - return &dstView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, coord, LanesDst); - else - { - const auto [blob, off] = mapSoA(flatArrayIndex, coord, MBDst); - return &dstView.storageBlobs[blob][off]; - } - }; - - constexpr auto L = std::min(LanesSrc, LanesDst); - static_assert(!srcIsAoSoA || LanesSrc % L == 0); - static_assert(!dstIsAoSoA || LanesDst % L == 0); - if constexpr (ReadOpt) - { - // optimized for linear reading - const auto elementsPerThread - = srcIsAoSoA ? flatSize / LanesSrc / numThreads * LanesSrc : flatSize / L / numThreads * L; -#pragma omp parallel num_threads(numThreads) - { - const auto id = static_cast(omp_get_thread_num()); - const auto start = id * elementsPerThread; - const auto stop = id == numThreads - 1 ? 
flatSize : (id + 1) * elementsPerThread; - - auto copyLBlock = [&](const std::byte*& threadSrc, std::size_t dstIndex, auto coord) LLAMA_LAMBDA_INLINE - { - constexpr auto bytes = L * sizeof(llama::GetType); - std::memcpy(mapDst(dstIndex, coord), threadSrc, bytes); - threadSrc += bytes; - }; - if constexpr (srcIsAoSoA) - { - auto* threadSrc = mapSrc(start, llama::RecordCoord<>{}); - for (std::size_t i = start; i < stop; i += LanesSrc) - llama::forEachLeaf( - [&](auto coord) LLAMA_LAMBDA_INLINE - { - for (std::size_t j = 0; j < LanesSrc; j += L) - copyLBlock(threadSrc, i + j, coord); - }); - } - else - { - llama::forEachLeaf( - [&](auto coord) LLAMA_LAMBDA_INLINE - { - auto* threadSrc = mapSrc(start, coord); - for (std::size_t i = start; i < stop; i += L) - copyLBlock(threadSrc, i, coord); - }); - } - } - } - else - { - // optimized for linear writing - const auto elementsPerThread - = dstIsAoSoA ? ((flatSize / LanesDst) / numThreads) * LanesDst : flatSize / L / numThreads * L; -#pragma omp parallel num_threads(numThreads) - { - const auto id = static_cast(omp_get_thread_num()); - const auto start = id * elementsPerThread; - const auto stop = id == numThreads - 1 ? flatSize : (id + 1) * elementsPerThread; - - auto copyLBlock = [&](std::byte*& threadDst, std::size_t srcIndex, auto coord) LLAMA_LAMBDA_INLINE - { - constexpr auto bytes = L * sizeof(llama::GetType); - std::memcpy(threadDst, mapSrc(srcIndex, coord), bytes); - threadDst += bytes; - }; - if constexpr (dstIsAoSoA) - { - auto* threadDst = mapDst(start, llama::RecordCoord<>{}); - for (std::size_t i = start; i < stop; i += LanesDst) - llama::forEachLeaf( - [&](auto coord) LLAMA_LAMBDA_INLINE - { - for (std::size_t j = 0; j < LanesDst; j += L) - copyLBlock(threadDst, i + j, coord); - }); - } - else - { - llama::forEachLeaf( - [&](auto coord) LLAMA_LAMBDA_INLINE - { - auto* threadDst = mapDst(start, coord); - for (std::size_t i = start; i < stop; i += L) - copyLBlock(threadDst, i, coord); - }); - } - } - } -} - -template < - bool ReadOpt, - typename ArrayDims, - typename RecordDim, - std::size_t LanesSrc, - typename SrcBlobType, - std::size_t LanesDst, - typename DstBlobType> -void aosoa_copy( - const llama::View< - llama::mapping::AoSoA, - SrcBlobType>& srcView, - llama::View< - llama::mapping::AoSoA, - DstBlobType>& dstView, - std::size_t numThreads = 1) -{ - aosoa_copy_internal(srcView, dstView, numThreads); -} - -template < - bool ReadOpt, - typename ArrayDims, - typename RecordDim, - std::size_t LanesSrc, - typename SrcBlobType, - bool DstSeparateBuffers, - typename DstBlobType> -void aosoa_copy( - const llama::View< - llama::mapping::AoSoA, - SrcBlobType>& srcView, - llama::View< - llama::mapping::SoA, - DstBlobType>& dstView, - std::size_t numThreads = 1) -{ - aosoa_copy_internal< - ReadOpt, - ArrayDims, - RecordDim, - LanesSrc, - std::numeric_limits::max(), - false, - DstSeparateBuffers>(srcView, dstView, numThreads); -} - -template < - bool ReadOpt, - typename ArrayDims, - typename RecordDim, - bool SrcSeparateBuffers, - typename SrcBlobType, - std::size_t LanesDst, - typename DstBlobType> -void aosoa_copy( - const llama::View< - llama::mapping::SoA, - SrcBlobType>& srcView, - llama::View< - llama::mapping::AoSoA, - DstBlobType>& dstView, - std::size_t numThreads = 1) -{ - aosoa_copy_internal< - ReadOpt, - ArrayDims, - RecordDim, - std::numeric_limits::max(), - LanesDst, - SrcSeparateBuffers, - false>(srcView, dstView, numThreads); -} - - template auto hash(const llama::View& view) { @@ -472,11 +191,19 @@ set ylabel "throughput 
[GiB/s]" #endif benchmarkMemcpy( "memcpy(p)", - [&](auto* dst, auto* src, auto size) { parallel_memcpy(dst, src, size, std::memcpy, numThreads); }); + [&](auto* dst, auto* src, auto size) + { +#pragma omp parallel + llama::internal::parallel_memcpy(dst, src, size, omp_get_thread_num(), omp_get_num_threads(), std::memcpy); + }); #ifdef __AVX2__ benchmarkMemcpy( "memcpy_avx2(p)", - [&](auto* dst, auto* src, auto size) { parallel_memcpy(dst, src, size, memcpy_avx2, numThreads); }); + [&](auto* dst, auto* src, auto size) + { +# pragma omp parallel + llama::internal::parallel_memcpy(dst, src, size, omp_get_thread_num(), omp_get_num_threads(), memcpy_avx2); + }); #else plotFile << "0\t"; #endif @@ -513,17 +240,17 @@ set ylabel "throughput [GiB/s]" plotFile << "0\t"; plotFile << "0\t"; plotFile << "0\t"; - benchmarkCopy("naive copy", [](const auto& srcView, auto& dstView) { naive_copy(srcView, dstView); }); + benchmarkCopy("naive copy", [](const auto& srcView, auto& dstView) { llama::fieldWiseCopy(srcView, dstView); }); benchmarkCopy("std::copy", [](const auto& srcView, auto& dstView) { std_copy(srcView, dstView); }); constexpr auto oneIsAoSoA = is_AoSoA || is_AoSoA; if constexpr (oneIsAoSoA) { benchmarkCopy( "aosoa copy(r)", - [](const auto& srcView, auto& dstView) { aosoa_copy(srcView, dstView); }); + [](const auto& srcView, auto& dstView) { llama::aosoaCommonBlockCopy(srcView, dstView, true); }); benchmarkCopy( "aosoa copy(w)", - [](const auto& srcView, auto& dstView) { aosoa_copy(srcView, dstView); }); + [](const auto& srcView, auto& dstView) { llama::aosoaCommonBlockCopy(srcView, dstView, false); }); } else { @@ -532,15 +259,27 @@ set ylabel "throughput [GiB/s]" } benchmarkCopy( "naive copy(p)", - [&](const auto& srcView, auto& dstView) { naive_copy(srcView, dstView, numThreads); }); + [&](const auto& srcView, auto& dstView) + { +#pragma omp parallel + llama::fieldWiseCopy(srcView, dstView, omp_get_thread_num(), omp_get_num_threads()); + }); if constexpr (oneIsAoSoA) { benchmarkCopy( "aosoa_copy(r,p)", - [&](const auto& srcView, auto& dstView) { aosoa_copy(srcView, dstView, numThreads); }); + [&](const auto& srcView, auto& dstView) + { +#pragma omp parallel + llama::aosoaCommonBlockCopy(srcView, dstView, true, omp_get_thread_num(), omp_get_num_threads()); + }); benchmarkCopy( "aosoa_copy(w,p)", - [&](const auto& srcView, auto& dstView) { aosoa_copy(srcView, dstView, numThreads); }); + [&](const auto& srcView, auto& dstView) + { +#pragma omp parallel + llama::aosoaCommonBlockCopy(srcView, dstView, false, omp_get_thread_num(), omp_get_num_threads()); + }); } else { diff --git a/include/llama/Copy.hpp b/include/llama/Copy.hpp new file mode 100644 index 0000000000..4075fb4baf --- /dev/null +++ b/include/llama/Copy.hpp @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include "View.hpp" +#include "mapping/AoSoA.hpp" +#include "mapping/SoA.hpp" + +#include +#include + +namespace llama +{ + namespace internal + { + inline void parallel_memcpy( + std::byte* dst, + const std::byte* src, + std::size_t size, + std::size_t threadId = 0, + std::size_t threadCount = 1, + decltype(std::memcpy) memcpy = std::memcpy) + { + const auto sizePerThread = size / threadCount; + const auto sizeLastThread = sizePerThread + size % threadCount; + const auto sizeThisThread = threadId == threadCount - 1 ? 
+    /// @brief Direct memcpy from source view blobs to destination view blobs. Both views need to have the same
+    /// mappings with the same array dimensions.
+    /// @param threadId Optional. Thread id in case of multi-threaded copy.
+    /// @param threadCount Optional. Thread count in case of multi-threaded copy.
+    template <typename Mapping, typename SrcBlob, typename DstBlob>
+    void blobMemcpy(
+        const View<Mapping, SrcBlob>& srcView,
+        View<Mapping, DstBlob>& dstView,
+        std::size_t threadId = 0,
+        std::size_t threadCount = 1)
+    {
+        // TODO: we do not verify if the mappings have other runtime state than the array dimensions
+        if (srcView.mapping.arrayDims() != dstView.mapping.arrayDims())
+            throw std::runtime_error{"Array dimensions sizes are different"};
+
+        // TODO: this is maybe not the best parallel copying strategy
+        for (std::size_t i = 0; i < Mapping::blobCount; i++)
+            internal::parallel_memcpy(
+                &dstView.storageBlobs[i][0],
+                &srcView.storageBlobs[i][0],
+                dstView.mapping.blobSize(i),
+                threadId,
+                threadCount);
+    }
+
+    /// @brief Field-wise copy from source to destination view. Both views need to have the same array and record
+    /// dimensions.
+    /// @param threadId Optional. Thread id in case of multi-threaded copy.
+    /// @param threadCount Optional. Thread count in case of multi-threaded copy.
+    template <typename SrcMapping, typename DstMapping, typename SrcBlob, typename DstBlob>
+    void fieldWiseCopy(
+        const View<SrcMapping, SrcBlob>& srcView,
+        View<DstMapping, DstBlob>& dstView,
+        std::size_t threadId = 0,
+        std::size_t threadCount = 1)
+    {
+        // TODO: think if we can remove this restriction
+        static_assert(std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>);
+
+        if (srcView.mapping.arrayDims() != dstView.mapping.arrayDims())
+            throw std::runtime_error{"Array dimensions sizes are different"};
+
+        using RecordDim = typename SrcMapping::RecordDim;
+        auto copyOne = [&](auto ad) LLAMA_LAMBDA_INLINE
+        {
+            forEachLeaf<RecordDim>([&](auto coord) LLAMA_LAMBDA_INLINE
+                                   { dstView(ad)(coord) = srcView(ad)(coord); });
+        };
+
+        constexpr auto dims = std::decay_t<decltype(srcView.mapping.arrayDims())>::rank;
+        const auto& adSize = srcView.mapping.arrayDims();
+        const auto workPerThread = (adSize[0] + threadCount - 1) / threadCount;
+        const auto start = threadId * workPerThread;
+        const auto end = std::min((threadId + 1) * workPerThread, adSize[0]);
+        for (auto i = start; i < end; i++)
+        {
+            if constexpr (dims > 1)
+                forEachADCoord(internal::popFront(adSize), copyOne, static_cast<std::size_t>(i));
+            else
+                copyOne(ArrayDims{static_cast<std::size_t>(i)});
+        }
+    }
+
+    namespace internal
+    {
+        template <typename Mapping>
+        inline constexpr std::size_t aosoaLanes = 0;
+
+        template <typename ArrayDims, typename RecordDim, bool SeparateBuffers, typename LinearizeArrayDimsFunctor>
+        inline constexpr std::size_t aosoaLanes<
+            mapping::SoA<ArrayDims, RecordDim, SeparateBuffers, LinearizeArrayDimsFunctor>> = std::
+            numeric_limits<std::size_t>::max();
+
+        template <typename ArrayDims, typename RecordDim, std::size_t Lanes, typename LinearizeArrayDimsFunctor>
+        inline constexpr std::size_t
+            aosoaLanes<mapping::AoSoA<ArrayDims, RecordDim, Lanes, LinearizeArrayDimsFunctor>> = Lanes;
+    } // namespace internal
+
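+    /// @brief Copy from source view to destination view by memcpy'ing common blocks of
+    /// min(LanesSrc, LanesDst) elements per record field. At least one of the mappings must be an AoSoA mapping.
+    /// E.g. copying from an AoSoA mapping with 8 lanes into one with 4 lanes moves blocks of 4 elements per field.
+    /// @param readOpt If true, the copy is optimized for linearly reading the source; otherwise, for linearly
+    /// writing the destination.
+    /// @param threadId Optional. Thread id in case of multi-threaded copy.
+    /// @param threadCount Optional. Thread count in case of multi-threaded copy.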
+    template <typename SrcMapping, typename DstMapping, typename SrcBlob, typename DstBlob>
+    void aosoaCommonBlockCopy(
+        const View<SrcMapping, SrcBlob>& srcView,
+        View<DstMapping, DstBlob>& dstView,
+        bool readOpt,
+        std::size_t threadId = 0,
+        std::size_t threadCount = 1)
+    {
+        // TODO: think if we can remove this restriction
+        static_assert(std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>);
+        static_assert(
+            std::is_same_v<
+                typename SrcMapping::LinearizeArrayDimsFunctor,
+                typename DstMapping::LinearizeArrayDimsFunctor>,
+            "Source and destination mapping need to use the same array dimensions linearizer");
+        using RecordDim = typename SrcMapping::RecordDim;
+
+        constexpr bool MBSrc = SrcMapping::blobCount > 1;
+        constexpr bool MBDst = DstMapping::blobCount > 1;
+        constexpr auto LanesSrc = internal::aosoaLanes<SrcMapping>;
+        constexpr auto LanesDst = internal::aosoaLanes<DstMapping>;
+
+        if (srcView.mapping.arrayDims() != dstView.mapping.arrayDims())
+            throw std::runtime_error{"Array dimensions sizes are different"};
+
+        constexpr auto srcIsAoSoA = LanesSrc != std::numeric_limits<std::size_t>::max();
+        constexpr auto dstIsAoSoA = LanesDst != std::numeric_limits<std::size_t>::max();
+
+        static_assert(srcIsAoSoA || dstIsAoSoA, "At least one of the mappings must be an AoSoA mapping");
+        static_assert(!srcIsAoSoA || decltype(srcView.storageBlobs)::rank == 1);
+        static_assert(!dstIsAoSoA || decltype(dstView.storageBlobs)::rank == 1);
+
+        const auto arrayDims = dstView.mapping.arrayDims();
+        const auto flatSize
+            = std::reduce(std::begin(arrayDims), std::end(arrayDims), std::size_t{1}, std::multiplies<>{});
+
+        // the same as AoSoA::blobNrAndOffset but takes a flat array index
+        auto mapAoSoA = [](std::size_t flatArrayIndex, auto coord, std::size_t Lanes) LLAMA_LAMBDA_INLINE
+        {
+            const auto blockIndex = flatArrayIndex / Lanes;
+            const auto laneIndex = flatArrayIndex % Lanes;
+            const auto offset = (sizeOf<RecordDim> * Lanes) * blockIndex
+                + offsetOf<RecordDim, decltype(coord)> * Lanes
+                + sizeof(GetType<RecordDim, decltype(coord)>) * laneIndex;
+            return offset;
+        };
+        // the same as SoA::blobNrAndOffset but takes a flat array index
+        auto mapSoA = [&](std::size_t flatArrayIndex, auto coord, bool mb) LLAMA_LAMBDA_INLINE
+        {
+            const auto blob = mb * flatRecordCoord<RecordDim, decltype(coord)>;
+            const auto offset = !mb * offsetOf<RecordDim, decltype(coord)> * flatSize
+                + sizeof(GetType<RecordDim, decltype(coord)>) * flatArrayIndex;
+            return NrAndOffset{blob, offset};
+        };
+
+        auto mapSrc = [&srcView, &mapAoSoA, &mapSoA](std::size_t flatArrayIndex, auto coord) LLAMA_LAMBDA_INLINE
+        {
+            if constexpr (srcIsAoSoA)
+                return &srcView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, coord, LanesSrc);
+            else
+            {
+                const auto [blob, off] = mapSoA(flatArrayIndex, coord, MBSrc);
+                return &srcView.storageBlobs[blob][off];
+            }
+        };
+        auto mapDst = [&dstView, &mapAoSoA, &mapSoA](std::size_t flatArrayIndex, auto coord) LLAMA_LAMBDA_INLINE
+        {
+            if constexpr (dstIsAoSoA)
+                return &dstView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, coord, LanesDst);
+            else
+            {
+                const auto [blob, off] = mapSoA(flatArrayIndex, coord, MBDst);
+                return &dstView.storageBlobs[blob][off];
+            }
+        };
+
+        constexpr auto L = std::min(LanesSrc, LanesDst);
+        static_assert(!srcIsAoSoA || LanesSrc % L == 0);
+        static_assert(!dstIsAoSoA || LanesDst % L == 0);
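+        // Each thread gets a chunk of the flat array, rounded down to a multiple of the traversed block size so
+        // every thread starts at a block boundary; the last thread also handles the remainder.
+        // E.g. flatSize = 100, L = 8, threadCount = 4: the threads copy 24, 24, 24 and 28 elements.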
+        if (readOpt)
+        {
+            // optimized for linear reading
+            const auto elementsPerThread
+                = srcIsAoSoA ? flatSize / LanesSrc / threadCount * LanesSrc : flatSize / L / threadCount * L;
+            {
+                const auto start = threadId * elementsPerThread;
+                const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;
+
+                auto copyLBlock = [&](const std::byte*& threadSrc, std::size_t dstIndex, auto coord) LLAMA_LAMBDA_INLINE
+                {
+                    constexpr auto bytes = L * sizeof(GetType<RecordDim, decltype(coord)>);
+                    std::memcpy(mapDst(dstIndex, coord), threadSrc, bytes);
+                    threadSrc += bytes;
+                };
+                if constexpr (srcIsAoSoA)
+                {
+                    auto* threadSrc = mapSrc(start, RecordCoord<>{});
+                    for (std::size_t i = start; i < stop; i += LanesSrc)
+                        forEachLeaf<RecordDim>(
+                            [&](auto coord) LLAMA_LAMBDA_INLINE
+                            {
+                                for (std::size_t j = 0; j < LanesSrc; j += L)
+                                    copyLBlock(threadSrc, i + j, coord);
+                            });
+                }
+                else
+                {
+                    forEachLeaf<RecordDim>(
+                        [&](auto coord) LLAMA_LAMBDA_INLINE
+                        {
+                            auto* threadSrc = mapSrc(start, coord);
+                            for (std::size_t i = start; i < stop; i += L)
+                                copyLBlock(threadSrc, i, coord);
+                        });
+                }
+            }
+        }
+        else
+        {
+            // optimized for linear writing
+            const auto elementsPerThread
+                = dstIsAoSoA ? ((flatSize / LanesDst) / threadCount) * LanesDst : flatSize / L / threadCount * L;
+            {
+                const auto start = threadId * elementsPerThread;
+                const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;
+
+                auto copyLBlock = [&](std::byte*& threadDst, std::size_t srcIndex, auto coord) LLAMA_LAMBDA_INLINE
+                {
+                    constexpr auto bytes = L * sizeof(GetType<RecordDim, decltype(coord)>);
+                    std::memcpy(threadDst, mapSrc(srcIndex, coord), bytes);
+                    threadDst += bytes;
+                };
+                if constexpr (dstIsAoSoA)
+                {
+                    auto* threadDst = mapDst(start, RecordCoord<>{});
+                    for (std::size_t i = start; i < stop; i += LanesDst)
+                        forEachLeaf<RecordDim>(
+                            [&](auto coord) LLAMA_LAMBDA_INLINE
+                            {
+                                for (std::size_t j = 0; j < LanesDst; j += L)
+                                    copyLBlock(threadDst, i + j, coord);
+                            });
+                }
+                else
+                {
+                    forEachLeaf<RecordDim>(
+                        [&](auto coord) LLAMA_LAMBDA_INLINE
+                        {
+                            auto* threadDst = mapDst(start, coord);
+                            for (std::size_t i = start; i < stop; i += L)
+                                copyLBlock(threadDst, i, coord);
+                        });
+                }
+            }
+        }
+    }
+
+    /// @brief Generic implementation of \ref copy. LLAMA provides several specializations of this construct for
+    /// specific mappings. Users are encouraged to also specialize this template with better copy algorithms.
+    template <typename SrcMapping, typename DstMapping, typename SFINAE = void>
+    struct Copy
+    {
+        template <typename SrcView, typename DstView>
+        void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const
+        {
+            fieldWiseCopy(srcView, dstView, threadId, threadCount);
+        }
+    };
+
+    template <typename Mapping>
+    struct Copy<Mapping, Mapping>
+    {
+        template <typename SrcView, typename DstView>
+        void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const
+        {
+            blobMemcpy(srcView, dstView, threadId, threadCount);
+        }
+    };
+
+    template <
+        typename ArrayDims,
+        typename RecordDim,
+        typename LinearizeArrayDims,
+        std::size_t LanesSrc,
+        std::size_t LanesDst>
+    struct Copy<
+        mapping::AoSoA<ArrayDims, RecordDim, LanesSrc, LinearizeArrayDims>,
+        mapping::AoSoA<ArrayDims, RecordDim, LanesDst, LinearizeArrayDims>,
+        std::enable_if_t<LanesSrc != LanesDst>>
+    {
+        template <typename SrcBlob, typename DstBlob>
+        void operator()(
+            const View<mapping::AoSoA<ArrayDims, RecordDim, LanesSrc, LinearizeArrayDims>, SrcBlob>& srcView,
+            View<mapping::AoSoA<ArrayDims, RecordDim, LanesDst, LinearizeArrayDims>, DstBlob>& dstView,
+            std::size_t threadId,
+            std::size_t threadCount)
+        {
+            constexpr auto readOpt = true; // TODO: how to choose?
+            aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
+        }
+    };
+
+    template <
+        typename ArrayDims,
+        typename RecordDim,
+        typename LinearizeArrayDims,
+        std::size_t LanesSrc,
+        bool DstSeparateBuffers>
+    struct Copy<
+        mapping::AoSoA<ArrayDims, RecordDim, LanesSrc, LinearizeArrayDims>,
+        mapping::SoA<ArrayDims, RecordDim, DstSeparateBuffers, LinearizeArrayDims>>
+    {
+        template <typename SrcBlob, typename DstBlob>
+        void operator()(
+            const View<mapping::AoSoA<ArrayDims, RecordDim, LanesSrc, LinearizeArrayDims>, SrcBlob>& srcView,
+            View<mapping::SoA<ArrayDims, RecordDim, DstSeparateBuffers, LinearizeArrayDims>, DstBlob>& dstView,
+            std::size_t threadId,
+            std::size_t threadCount)
+        {
+            constexpr auto readOpt = true; // TODO: how to choose?
+            aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
+        }
+    };
+
+    template <
+        typename ArrayDims,
+        typename RecordDim,
+        typename LinearizeArrayDims,
+        std::size_t LanesDst,
+        bool SrcSeparateBuffers>
+    struct Copy<
+        mapping::SoA<ArrayDims, RecordDim, SrcSeparateBuffers, LinearizeArrayDims>,
+        mapping::AoSoA<ArrayDims, RecordDim, LanesDst, LinearizeArrayDims>>
+    {
+        template <typename SrcBlob, typename DstBlob>
+        void operator()(
+            const View<mapping::SoA<ArrayDims, RecordDim, SrcSeparateBuffers, LinearizeArrayDims>, SrcBlob>& srcView,
+            View<mapping::AoSoA<ArrayDims, RecordDim, LanesDst, LinearizeArrayDims>, DstBlob>& dstView,
+            std::size_t threadId,
+            std::size_t threadCount)
+        {
+            constexpr auto readOpt = true; // TODO: how to choose?
+            aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
+        }
+    };
+
+    /// Copy data from source view to destination view. Both views need to have the same array and record
+    /// dimensions. Delegates to \ref Copy to choose an implementation.
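+    /// @param threadId Optional. Thread id in case of multi-threaded copy.
+    /// @param threadCount Optional. Thread count in case of multi-threaded copy.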
+    template <typename SrcMapping, typename DstMapping, typename SrcBlob, typename DstBlob>
+    void copy(
+        const View<SrcMapping, SrcBlob>& srcView,
+        View<DstMapping, DstBlob>& dstView,
+        std::size_t threadId = 0,
+        std::size_t threadCount = 1)
+    {
+        Copy<SrcMapping, DstMapping>{}(srcView, dstView, threadId, threadCount);
+    }
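+
+    // Typical usage (see also docs/pages/copying.rst):
+    //     llama::copy(srcView, dstView); // delegates to the best matching Copy specialization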
+} // namespace llama
diff --git a/include/llama/llama.hpp b/include/llama/llama.hpp
index c552ac7170..fa135dc765 100644
--- a/include/llama/llama.hpp
+++ b/include/llama/llama.hpp
@@ -33,6 +33,7 @@
 #include "ArrayDimsIndexRange.hpp"
 #include "BlobAllocators.hpp"
+#include "Copy.hpp"
 #include "Core.hpp"
 #include "Vector.hpp"
 #include "View.hpp"
diff --git a/include/llama/mapping/AoS.hpp b/include/llama/mapping/AoS.hpp
index 71c0300913..7b794f5374 100644
--- a/include/llama/mapping/AoS.hpp
+++ b/include/llama/mapping/AoS.hpp
@@ -10,17 +10,18 @@ namespace llama::mapping
     /// Array of struct mapping. Used to create a \ref View via \ref allocView.
     /// \tparam AlignAndPad If true, padding bytes are inserted to guarantee that struct members are properly aligned.
     /// If false, struct members are tightly packed.
-    /// \tparam LinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
+    /// \tparam T_LinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
     /// how big the linear domain gets.
     template <
         typename T_ArrayDims,
         typename T_RecordDim,
         bool AlignAndPad = true,
-        typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
+        typename T_LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
     struct AoS
     {
         using ArrayDims = T_ArrayDims;
         using RecordDim = T_RecordDim;
+        using LinearizeArrayDimsFunctor = T_LinearizeArrayDimsFunctor;
         static constexpr std::size_t blobCount = 1;
 
         constexpr AoS() = default;
diff --git a/include/llama/mapping/AoSoA.hpp b/include/llama/mapping/AoSoA.hpp
index 1f29f0f7e8..d016fab549 100644
--- a/include/llama/mapping/AoSoA.hpp
+++ b/include/llama/mapping/AoSoA.hpp
@@ -26,17 +26,18 @@ namespace llama::mapping
     /// Array of struct of arrays mapping. Used to create a \ref View via \ref allocView.
     /// \tparam Lanes The size of the inner arrays of this array of struct of arrays.
-    /// \tparam LinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
+    /// \tparam T_LinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
     /// how big the linear domain gets.
     template <
         typename T_ArrayDims,
         typename T_RecordDim,
         std::size_t Lanes,
-        typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
+        typename T_LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
     struct AoSoA
     {
         using ArrayDims = T_ArrayDims;
         using RecordDim = T_RecordDim;
+        using LinearizeArrayDimsFunctor = T_LinearizeArrayDimsFunctor;
         static constexpr std::size_t blobCount = 1;
 
         constexpr AoSoA() = default;
diff --git a/include/llama/mapping/SoA.hpp b/include/llama/mapping/SoA.hpp
index 640a7c3262..dc41b593e6 100644
--- a/include/llama/mapping/SoA.hpp
+++ b/include/llama/mapping/SoA.hpp
@@ -17,11 +17,12 @@ namespace llama::mapping
         typename T_ArrayDims,
         typename T_RecordDim,
         bool SeparateBuffers = true,
-        typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
+        typename T_LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
     struct SoA
     {
         using ArrayDims = T_ArrayDims;
         using RecordDim = T_RecordDim;
+        using LinearizeArrayDimsFunctor = T_LinearizeArrayDimsFunctor;
         static constexpr std::size_t blobCount
             = SeparateBuffers ? boost::mp11::mp_size<FlatRecordDim<RecordDim>>::value : 1;
diff --git a/tests/copy.cpp b/tests/copy.cpp
new file mode 100644
index 0000000000..a766dd0a33
--- /dev/null
+++ b/tests/copy.cpp
@@ -0,0 +1,113 @@
+#include "common.h"
+
+#include <catch2/catch.hpp>
+#include <llama/llama.hpp>
+
+namespace
+{
+    using ArrayDims = llama::ArrayDims<2>;
+    using RecordDim = Vec3I;
+
+    template <typename SrcMapping, typename DstMapping, typename CopyFunc>
+    void testCopy(CopyFunc copy)
+    {
+        const auto viewSize = ArrayDims{4, 8};
+        const auto srcMapping = SrcMapping(viewSize);
+        auto srcView = llama::allocView(srcMapping);
+        auto value = std::size_t{0};
+        for (auto ad : llama::ArrayDimsIndexRange{srcMapping.arrayDims()})
+            llama::forEachLeaf<RecordDim>(
+                [&](auto coord)
+                {
+                    srcView(ad)(coord) = value;
+                    value++;
+                });
+
+        auto dstView = llama::allocView(DstMapping(viewSize));
+        copy(srcView, dstView);
+
+        value = 0;
+        for (auto ad : llama::ArrayDimsIndexRange{srcMapping.arrayDims()})
+            llama::forEachLeaf<RecordDim>(
+                [&](auto coord)
+                {
+                    CHECK(dstView(ad)(coord) == value);
+                    value++;
+                });
+    }
+
+    template <typename Mapping>
+    inline constexpr bool isSoA = false;
+
+    template <typename ArrayDims, typename RecordDim, bool SeparateBuffers, typename LinearizeArrayDimsFunctor>
+    inline constexpr bool
+        isSoA<llama::mapping::SoA<ArrayDims, RecordDim, SeparateBuffers, LinearizeArrayDimsFunctor>> = true;
+
+    // Do not test all combinations as this explodes the unit test compile time and runtime.
+    using AoSMappings = boost::mp11::mp_list<
+        llama::mapping::AoS<ArrayDims, RecordDim, false, llama::mapping::LinearizeArrayDimsCpp>,
+        // llama::mapping::AoS<ArrayDims, RecordDim, false, llama::mapping::LinearizeArrayDimsFortran>,
+        // llama::mapping::AoS<ArrayDims, RecordDim, true, llama::mapping::LinearizeArrayDimsCpp>,
+        llama::mapping::AoS<ArrayDims, RecordDim, true, llama::mapping::LinearizeArrayDimsFortran>>;
+
+    using OtherMappings = boost::mp11::mp_list<
+        llama::mapping::SoA<ArrayDims, RecordDim, false, llama::mapping::LinearizeArrayDimsCpp>,
+        // llama::mapping::SoA<ArrayDims, RecordDim, false, llama::mapping::LinearizeArrayDimsFortran>,
+        // llama::mapping::SoA<ArrayDims, RecordDim, true, llama::mapping::LinearizeArrayDimsCpp>,
+        llama::mapping::SoA<ArrayDims, RecordDim, true, llama::mapping::LinearizeArrayDimsFortran>,
+        llama::mapping::AoSoA<ArrayDims, RecordDim, 4, llama::mapping::LinearizeArrayDimsCpp>,
+        // llama::mapping::AoSoA<ArrayDims, RecordDim, 4, llama::mapping::LinearizeArrayDimsFortran>,
+        // llama::mapping::AoSoA<ArrayDims, RecordDim, 8, llama::mapping::LinearizeArrayDimsCpp>,
+        llama::mapping::AoSoA<ArrayDims, RecordDim, 8, llama::mapping::LinearizeArrayDimsFortran>>;
+
+    using AllMappings = boost::mp11::mp_append<AoSMappings, OtherMappings>;
+
+    using AllMappingsProduct = boost::mp11::mp_product<boost::mp11::mp_list, AllMappings, AllMappings>;
+
+    template <typename List>
+    using BothAreSoAOrHaveDifferentLinearizer = std::bool_constant<
+        (isSoA<boost::mp11::mp_first<List>> && isSoA<boost::mp11::mp_second<List>>)
+        || !std::is_same_v<
+            typename boost::mp11::mp_first<List>::LinearizeArrayDimsFunctor,
+            typename boost::mp11::mp_second<List>::LinearizeArrayDimsFunctor>>;
+
+    using AoSoAMappingsProduct = boost::mp11::mp_remove_if<
+        boost::mp11::mp_product<boost::mp11::mp_list, OtherMappings, OtherMappings>,
+        BothAreSoAOrHaveDifferentLinearizer>;
+} // namespace
+
+TEMPLATE_LIST_TEST_CASE("copy", "", AllMappingsProduct)
+{
+    using SrcMapping = boost::mp11::mp_first<TestType>;
+    using DstMapping = boost::mp11::mp_second<TestType>;
+    testCopy<SrcMapping, DstMapping>([](const auto& srcView, auto& dstView) { llama::copy(srcView, dstView); });
+}
+
+TEMPLATE_LIST_TEST_CASE("blobMemcpy", "", AllMappings)
+{
+    testCopy<TestType, TestType>([](const auto& srcView, auto& dstView) { llama::blobMemcpy(srcView, dstView); });
+}
+
+TEMPLATE_LIST_TEST_CASE("fieldWiseCopy", "", AllMappingsProduct)
+{
+    using SrcMapping = boost::mp11::mp_first<TestType>;
+    using DstMapping = boost::mp11::mp_second<TestType>;
+    testCopy<SrcMapping, DstMapping>(
+        [](const auto& srcView, auto& dstView) { llama::fieldWiseCopy(srcView, dstView); });
+}
+
+TEMPLATE_LIST_TEST_CASE("aosoaCommonBlockCopy.readOpt", "", AoSoAMappingsProduct)
+{
+    using SrcMapping = boost::mp11::mp_first<TestType>;
+    using DstMapping = boost::mp11::mp_second<TestType>;
+    testCopy<SrcMapping, DstMapping>(
+        [](const auto& srcView, auto& dstView) { llama::aosoaCommonBlockCopy(srcView, dstView, true); });
+}
+
+TEMPLATE_LIST_TEST_CASE("aosoaCommonBlockCopy.writeOpt", "", AoSoAMappingsProduct)
+{
+    using SrcMapping = boost::mp11::mp_first<TestType>;
+    using DstMapping = boost::mp11::mp_second<TestType>;
+    testCopy<SrcMapping, DstMapping>(
+        [](const auto& srcView, auto& dstView) { llama::aosoaCommonBlockCopy(srcView, dstView, false); });
+}