Merge pull request #120 from bernhardmgruber/pcopy
Add parallel versions of copy algorithms in viewcopy example
bernhardmgruber authored Nov 9, 2020
2 parents 2f60140 + eeabdac commit 33d1409
Showing 2 changed files with 101 additions and 30 deletions.
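The new parallel variants all follow the same idea: split the work into one contiguous chunk per hardware thread and dispatch the chunks through std::for_each with std::execution::par. A minimal, self-contained sketch of that chunking pattern, for orientation only (the function name chunked_copy and the exact remainder handling are illustrative, not the committed code below):

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <execution>
#include <numeric>
#include <thread>
#include <vector>

// copy `size` bytes from src to dst, one contiguous chunk per hardware thread
void chunked_copy(std::byte* dst, const std::byte* src, std::size_t size)
{
    const auto threads = std::max<std::size_t>(std::size_t{1}, std::thread::hardware_concurrency());
    const auto chunk = size / threads; // bytes per thread, rounded down
    std::vector<std::size_t> ids(threads);
    std::iota(ids.begin(), ids.end(), std::size_t{0});
    std::for_each(std::execution::par, ids.begin(), ids.end(), [&](std::size_t id) {
        const auto offset = id * chunk;
        // the last thread also picks up the remainder left by the rounding
        const auto bytes = id == threads - 1 ? size - offset : chunk;
        std::memcpy(dst + offset, src + offset, bytes);
    });
}
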
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -14,7 +14,7 @@ jobs:
mkdir build
cd build
export BOOST_ROOT=$BOOST_ROOT_1_72_0
CXX=g++-9 cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DASAN_FOR_TESTS=ON -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
CXX=g++-10 cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DASAN_FOR_TESTS=ON -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
- name: build
run: cmake --build build
- name: tests
129 changes: 100 additions & 29 deletions examples/viewcopy/viewcopy.cpp
@@ -1,9 +1,12 @@
#include <boost/functional/hash.hpp>
#include <boost/mp11.hpp>
#include <boost/range/irange.hpp>
#include <chrono>
#include <execution>
#include <llama/llama.hpp>
#include <numeric>
#include <string_view>
#include <thread>

// clang-format off
namespace tag
@@ -31,22 +34,43 @@ using Particle = llama::DS<
>;
// clang-format on

template <typename Mapping1, typename BlobType1, typename Mapping2, typename BlobType2>
void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Mapping2, BlobType2>& dstView)
template <
typename Mapping1,
typename BlobType1,
typename Mapping2,
typename BlobType2,
typename Ex = std::execution::sequenced_policy>
void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Mapping2, BlobType2>& dstView, Ex ex = {})
{
static_assert(std::is_same_v<typename Mapping1::DatumDomain, typename Mapping2::DatumDomain>);

if (srcView.mapping.arrayDomainSize != dstView.mapping.arrayDomainSize)
throw std::runtime_error{"UserDomain sizes are different"};

for (auto ad : llama::ArrayDomainIndexRange{srcView.mapping.arrayDomainSize})
auto r = llama::ArrayDomainIndexRange{srcView.mapping.arrayDomainSize};
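// walk every array-domain index (sequentially or in parallel, depending on the execution policy) and copy each record field by field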
std::for_each(ex, std::begin(r), std::end(r), [&](auto ad) {
llama::forEach<typename Mapping1::DatumDomain>([&](auto coord) {
dstView(ad)(coord) = srcView(ad)(coord);
// std::memcpy(
// &dstView(ad)(coord),
// &srcView(ad)(coord),
// sizeof(llama::GetType<typename Mapping1::DatumDomain, decltype(coord)>));
});
});
}

void parallel_memcpy(std::byte* dst, const std::byte* src, std::size_t size)
{
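// split the byte range into one contiguous chunk per hardware thread; the last thread's chunk also covers the division remainder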
const auto threads = std::size_t{std::thread::hardware_concurrency()};
const auto sizePerThread = size / threads;
const auto sizeLastThread = sizePerThread + size % threads;
auto r = boost::irange(threads);
std::for_each(std::execution::par, std::begin(r), std::end(r), [&](auto id) {
std::memcpy(
dst + id * sizePerThread,
src + id * sizePerThread,
id == threads - 1 ? sizeLastThread : sizePerThread);
});
}

template <
@@ -56,14 +80,16 @@ template <
std::size_t LanesSrc,
typename BlobType1,
std::size_t LanesDst,
typename BlobType2>
typename BlobType2,
typename Ex = std::execution::sequenced_policy>
void aosoa_copy(
const llama::View<
llama::mapping::AoSoA<ArrayDomain, DatumDomain, LanesSrc, llama::mapping::LinearizeArrayDomainCpp>,
BlobType1>& srcView,
llama::View<
llama::mapping::AoSoA<ArrayDomain, DatumDomain, LanesDst, llama::mapping::LinearizeArrayDomainCpp>,
BlobType2>& dstView)
BlobType2>& dstView,
Ex ex = {})
{
static_assert(srcView.storageBlobs.rank == 1);
static_assert(dstView.storageBlobs.rank == 1);
@@ -90,37 +116,59 @@ void aosoa_copy(
return offset;
};

const auto threads = [&]() -> std::size_t {
if constexpr (std::is_same_v<Ex, std::execution::sequenced_policy>)
return 1u;
else
return std::thread::hardware_concurrency();
}();
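// a sequenced policy keeps everything in one chunk; any other policy uses one chunk per hardware thread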
const auto threadIds = boost::irange(threads);
if constexpr (ReadOpt)
{
// optimized for linear reading
for (std::size_t i = 0; i < flatSize; i += LanesSrc)
{
llama::forEach<DatumDomain>([&](auto coord) {
constexpr auto L = std::min(LanesSrc, LanesDst);
for (std::size_t j = 0; j < LanesSrc; j += L)
{
constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
std::memcpy(&dst[map(i + j, coord, LanesDst)], src, bytes);
src += bytes;
}
});
}
const auto elementsPerThread = ((flatSize / LanesSrc) / threads) * LanesSrc;
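// every thread copies a whole number of source blocks (a multiple of LanesSrc); the last thread runs up to flatSize and takes the remainder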
std::for_each(ex, std::begin(threadIds), std::end(threadIds), [&](auto id) {
const auto start = id * elementsPerThread;
const auto stop = id == threads - 1 ? flatSize : (id + 1) * elementsPerThread;
auto* threadSrc = src + map(start, llama::DatumCoord<>{}, LanesSrc);

for (std::size_t i = start; i < stop; i += LanesSrc)
{
llama::forEach<DatumDomain>([&](auto coord) {
constexpr auto L = std::min(LanesSrc, LanesDst);
for (std::size_t j = 0; j < LanesSrc; j += L)
{
constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
std::memcpy(&dst[map(i + j, coord, LanesDst)], threadSrc, bytes);
threadSrc += bytes;
}
});
}
});
}
else
{
// optimized for linear writing
for (std::size_t i = 0; i < flatSize; i += LanesDst)
{
llama::forEach<DatumDomain>([&](auto coord) {
constexpr auto L = std::min(LanesSrc, LanesDst);
for (std::size_t j = 0; j < LanesDst; j += L)
{
constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
std::memcpy(dst, &src[map(i + j, coord, LanesSrc)], bytes);
dst += bytes;
}
});
}
const auto elementsPerThread = ((flatSize / LanesDst) / threads) * LanesDst;
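// every thread writes a whole number of destination blocks (a multiple of LanesDst); the last thread runs up to flatSize and takes the remainder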
std::for_each(ex, std::begin(threadIds), std::end(threadIds), [&](auto id) {
const auto start = id * elementsPerThread;
const auto stop = id == threads - 1 ? flatSize : (id + 1) * elementsPerThread;

auto* threadDst = dst + map(start, llama::DatumCoord<>{}, LanesDst);

for (std::size_t i = start; i < stop; i += LanesDst)
{
llama::forEach<DatumDomain>([&](auto coord) {
constexpr auto L = std::min(LanesSrc, LanesDst);
for (std::size_t j = 0; j < LanesDst; j += L)
{
constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
std::memcpy(threadDst, &src[map(i + j, coord, LanesSrc)], bytes);
threadDst += bytes;
}
});
}
});
}
}

@@ -180,11 +228,22 @@ int main(int argc, char** argv)
benchmarkCopy("naive_copy", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView);
});
benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView, std::execution::par);
});
benchmarkCopy("memcpy ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
static_assert(srcView.storageBlobs.rank == 1);
static_assert(dstView.storageBlobs.rank == 1);
std::memcpy(dstView.storageBlobs[0].data(), srcView.storageBlobs[0].data(), dstView.storageBlobs[0].size());
});
benchmarkCopy("memcpy(p) ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
static_assert(srcView.storageBlobs.rank == 1);
static_assert(dstView.storageBlobs.rank == 1);
parallel_memcpy(
dstView.storageBlobs[0].data(),
srcView.storageBlobs[0].data(),
dstView.storageBlobs[0].size());
});
}

{
Expand All @@ -196,6 +255,9 @@ int main(int argc, char** argv)
benchmarkCopy("naive_copy", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView);
});
benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView, std::execution::par);
});
benchmarkCopy("memcpy ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
static_assert(srcView.storageBlobs.rank == 1);
static_assert(dstView.storageBlobs.rank == 1);
Expand All @@ -220,6 +282,9 @@ int main(int argc, char** argv)
benchmarkCopy("naive_copy ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView);
});
benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView, std::execution::par);
});
benchmarkCopy("memcpy ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
static_assert(srcView.storageBlobs.rank == 1);
static_assert(dstView.storageBlobs.rank == 1);
@@ -231,5 +296,11 @@ int main(int argc, char** argv)
benchmarkCopy("aosoa_copy(w)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
aosoa_copy<false>(srcView, dstView);
});
benchmarkCopy("aosoa_copy(r,p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
aosoa_copy<true>(srcView, dstView, std::execution::par);
});
benchmarkCopy("aosoa_copy(w,p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
aosoa_copy<false>(srcView, dstView, std::execution::par);
});
});
}
