From 931ce77f7d6519a13e85379f8a510e59adf38ae3 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Thu, 3 Jun 2021 18:31:49 +0200
Subject: [PATCH] lift copy implementations from example into llama

Move the copy strategies from the viewcopy example into llama.
Add a new blobMemcpy strategy.
Add documentation.
Add unit tests.
Fix always defaulting to std::memcpy for parallel memcpy.
---
 docs/pages/api.rst              |   9 +
 docs/pages/copying.rst          |  43 +++-
 examples/viewcopy/viewcopy.cpp  | 317 +++------------------------
 include/llama/Copy.hpp          | 364 ++++++++++++++++++++++++++++++++
 include/llama/llama.hpp         |   1 +
 include/llama/mapping/AoS.hpp   |   5 +-
 include/llama/mapping/AoSoA.hpp |   5 +-
 include/llama/mapping/SoA.hpp   |   3 +-
 tests/copy.cpp                  | 113 ++++++++++
 9 files changed, 562 insertions(+), 298 deletions(-)
 create mode 100644 include/llama/Copy.hpp
 create mode 100644 tests/copy.cpp

diff --git a/docs/pages/api.rst b/docs/pages/api.rst
index c58e6ab35f..0780366db2 100644
--- a/docs/pages/api.rst
+++ b/docs/pages/api.rst
@@ -157,6 +157,15 @@ Data access
 .. doxygenstruct:: llama::VirtualRecord
    :members:
 
+Copying
+-------
+
+.. doxygenfunction:: llama::copy
+.. doxygenstruct:: llama::Copy
+   :members:
+.. doxygenfunction:: llama::fieldWiseCopy
+.. doxygenfunction:: llama::aosoaCommonBlockCopy
+
 Macros
 ------
 
diff --git a/docs/pages/copying.rst b/docs/pages/copying.rst
index 57aaa5ffce..24a99b3809 100644
--- a/docs/pages/copying.rst
+++ b/docs/pages/copying.rst
@@ -16,10 +16,6 @@ E.g. when both mappings use SoA, but one time with, one time without padding, or
 Or two AoSoA mappings with a different inner array length.
 In those cases an optimized copy procedure is possible, copying larger chunks than mere fields.
 
-.. For the moment, LLAMA implements a generic, field-wise copy with specializations for combinations of SoA and AoSoA mappings, reflect the properties of these.
-.. This is sub-optimal, because for every new mapping new specializations are needed.
-
-.. One thus needs new approaches on how to improve copying because LLAMA can provide the necessary infrastructure:
 Four solutions exist for this problem:
 
 1. Implement specializations for specific combinations of mappings, which reflect the properties of these.
@@ -42,3 +38,42 @@ A good approach could use smaller intermediate views to shuffle a chunk from one view to the other.
 The `async copy example `_ tries to show an asynchronous copy/shuffle/compute workflow.
 This example applies a blurring kernel to an RGB image, but may also work on only one or two channels instead of all three.
 Unused channels are not allocated and especially not copied.
+
+
+For the moment, LLAMA implements a generic, field-wise copy with specializations for combinations of SoA and AoSoA mappings, reflecting the properties of these.
+
+.. code-block:: C++
+
+    auto srcView = llama::allocView(srcMapping);
+    auto dstView = llama::allocView(dstMapping);
+    llama::copy(srcView, dstView); // use best copy strategy
+
+Internally, :cpp:`llama::copy` will choose a copy strategy depending on the source and destination mapping.
+This choice is done via template specializations of the :cpp:`llama::Copy` class template.
+Users can add specializations of :cpp:`llama::Copy` to provide additional copy strategies:
+
+.. code-block:: C++
+
+    // provide special copy from AoS -> UserDefinedMapping
+    template <typename ArrayDims, typename RecordDim, bool AlignAndPad, typename LinearizeArrayDims>
+    struct Copy<
+        llama::mapping::AoS<ArrayDims, RecordDim, AlignAndPad, LinearizeArrayDims>,
+        UserDefinedMapping<ArrayDims, RecordDim>>
+    {
+        template <typename SrcBlob, typename DstBlob>
+        void operator()(
+            const View<llama::mapping::AoS<ArrayDims, RecordDim, AlignAndPad, LinearizeArrayDims>, SrcBlob>& srcView,
+            View<UserDefinedMapping<ArrayDims, RecordDim>, DstBlob>& dstView,
+            std::size_t threadId, std::size_t threadCount) {
+            ...
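+            // hypothetical body (the patch intentionally elides it): e.g. shuffle the data
+            // field by field, or fall back to the generic helper:
+            // fieldWiseCopy(srcView, dstView, threadId, threadCount);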
+ } + }; + + llama::copy(srcView, dstView); // can delegate to above specialization now + +LLAMA also allows direct access to its two copy implementations, which is mainly used for benchmarking them: + +.. code-block:: C++ + + llama::fieldWiseCopy(srcView, dstView); // explicit field-wise copy + llama::aosoaCommonBlockCopy(srcView, dstView); // explicit SoA/AoSoA copy \ No newline at end of file diff --git a/examples/viewcopy/viewcopy.cpp b/examples/viewcopy/viewcopy.cpp index b36b304edf..a688a64a1c 100644 --- a/examples/viewcopy/viewcopy.cpp +++ b/examples/viewcopy/viewcopy.cpp @@ -46,45 +46,6 @@ using Particle = llama::Record< using RecordDim = boost::mp11::mp_take_c; // using RecordDim = Event; // WARN: expect long compilation time -namespace llamaex -{ - using namespace llama; - - template - void parallelForEachADCoord(ArrayDims adSize, std::size_t numThreads, Func&& func) - { -#pragma omp parallel for num_threads(numThreads) - for (std::ptrdiff_t i = 0; i < static_cast(adSize[0]); i++) - { - if constexpr (Dim > 1) - forEachADCoord(internal::popFront(adSize), std::forward(func), static_cast(i)); - else - std::forward(func)(ArrayDims{static_cast(i)}); - } - } -} // namespace llamaex - -template -void naive_copy( - const llama::View& srcView, - llama::View& dstView, - std::size_t numThreads = 1) -{ - static_assert(std::is_same_v); - - if (srcView.mapping.arrayDims() != dstView.mapping.arrayDims()) - throw std::runtime_error{"Array dimensions sizes are different"}; - - llamaex::parallelForEachADCoord( - srcView.mapping.arrayDims(), - numThreads, - [&](auto ad) LLAMA_LAMBDA_INLINE - { - llama::forEachLeaf([&](auto coord) LLAMA_LAMBDA_INLINE - { dstView(ad)(coord) = srcView(ad)(coord); }); - }); -} - template void std_copy(const llama::View& srcView, llama::View& dstView) { @@ -140,248 +101,6 @@ auto memcpy_avx2(void* dst, const void* src, size_t n) noexcept -> void* } #endif -inline void parallel_memcpy( - std::byte* dst, - const std::byte* src, - std::size_t size, - decltype(std::memcpy) = std::memcpy, - std::size_t numThreads = 1) -{ - const auto sizePerThread = size / numThreads; - const auto sizeLastThread = sizePerThread + size % numThreads; - -#pragma omp parallel num_threads(numThreads) - { - const auto id = static_cast(omp_get_thread_num()); - const auto sizeThisThread = id == numThreads - 1 ? 
sizeLastThread : sizePerThread; - std::memcpy(dst + id * sizePerThread, src + id * sizePerThread, sizeThisThread); - } -} - -template < - bool ReadOpt, - typename ArrayDims, - typename RecordDim, - std::size_t LanesSrc, - std::size_t LanesDst, - bool MBSrc, - bool MBDst, - typename SrcView, - typename DstView> -void aosoa_copy_internal(const SrcView& srcView, DstView& dstView, std::size_t numThreads) -{ - if (srcView.mapping.arrayDims() != dstView.mapping.arrayDims()) - throw std::runtime_error{"Array dimensions sizes are different"}; - - constexpr auto srcIsAoSoA = LanesSrc != std::numeric_limits::max(); - constexpr auto dstIsAoSoA = LanesDst != std::numeric_limits::max(); - - static_assert(!srcIsAoSoA || decltype(srcView.storageBlobs)::rank == 1); - static_assert(!dstIsAoSoA || decltype(dstView.storageBlobs)::rank == 1); - - const auto arrayDims = dstView.mapping.arrayDims(); - const auto flatSize = std::reduce(std::begin(arrayDims), std::end(arrayDims), std::size_t{1}, std::multiplies<>{}); - - // the same as AoSoA::blobNrAndOffset but takes a flat array index - auto mapAoSoA = [](std::size_t flatArrayIndex, auto coord, std::size_t Lanes) LLAMA_LAMBDA_INLINE - { - const auto blockIndex = flatArrayIndex / Lanes; - const auto laneIndex = flatArrayIndex % Lanes; - const auto offset = (llama::sizeOf * Lanes) * blockIndex - + llama::offsetOf * Lanes - + sizeof(llama::GetType) * laneIndex; - return offset; - }; - // the same as SoA::blobNrAndOffset but takes a flat array index - auto mapSoA = [&](std::size_t flatArrayIndex, auto coord, bool mb) LLAMA_LAMBDA_INLINE - { - const auto blob = mb * llama::flatRecordCoord; - const auto offset = !mb * llama::offsetOf * flatSize - + sizeof(llama::GetType) * flatArrayIndex; - return llama::NrAndOffset{blob, offset}; - }; - - auto mapSrc = [&srcView, &mapAoSoA, &mapSoA](std::size_t flatArrayIndex, auto coord) LLAMA_LAMBDA_INLINE - { - if constexpr (srcIsAoSoA) - return &srcView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, coord, LanesSrc); - else - { - const auto [blob, off] = mapSoA(flatArrayIndex, coord, MBSrc); - return &srcView.storageBlobs[blob][off]; - } - }; - auto mapDst = [&dstView, &mapAoSoA, &mapSoA](std::size_t flatArrayIndex, auto coord) LLAMA_LAMBDA_INLINE - { - if constexpr (dstIsAoSoA) - return &dstView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, coord, LanesDst); - else - { - const auto [blob, off] = mapSoA(flatArrayIndex, coord, MBDst); - return &dstView.storageBlobs[blob][off]; - } - }; - - constexpr auto L = std::min(LanesSrc, LanesDst); - static_assert(!srcIsAoSoA || LanesSrc % L == 0); - static_assert(!dstIsAoSoA || LanesDst % L == 0); - if constexpr (ReadOpt) - { - // optimized for linear reading - const auto elementsPerThread - = srcIsAoSoA ? flatSize / LanesSrc / numThreads * LanesSrc : flatSize / L / numThreads * L; -#pragma omp parallel num_threads(numThreads) - { - const auto id = static_cast(omp_get_thread_num()); - const auto start = id * elementsPerThread; - const auto stop = id == numThreads - 1 ? 
flatSize : (id + 1) * elementsPerThread; - - auto copyLBlock = [&](const std::byte*& threadSrc, std::size_t dstIndex, auto coord) LLAMA_LAMBDA_INLINE - { - constexpr auto bytes = L * sizeof(llama::GetType); - std::memcpy(mapDst(dstIndex, coord), threadSrc, bytes); - threadSrc += bytes; - }; - if constexpr (srcIsAoSoA) - { - auto* threadSrc = mapSrc(start, llama::RecordCoord<>{}); - for (std::size_t i = start; i < stop; i += LanesSrc) - llama::forEachLeaf( - [&](auto coord) LLAMA_LAMBDA_INLINE - { - for (std::size_t j = 0; j < LanesSrc; j += L) - copyLBlock(threadSrc, i + j, coord); - }); - } - else - { - llama::forEachLeaf( - [&](auto coord) LLAMA_LAMBDA_INLINE - { - auto* threadSrc = mapSrc(start, coord); - for (std::size_t i = start; i < stop; i += L) - copyLBlock(threadSrc, i, coord); - }); - } - } - } - else - { - // optimized for linear writing - const auto elementsPerThread - = dstIsAoSoA ? ((flatSize / LanesDst) / numThreads) * LanesDst : flatSize / L / numThreads * L; -#pragma omp parallel num_threads(numThreads) - { - const auto id = static_cast(omp_get_thread_num()); - const auto start = id * elementsPerThread; - const auto stop = id == numThreads - 1 ? flatSize : (id + 1) * elementsPerThread; - - auto copyLBlock = [&](std::byte*& threadDst, std::size_t srcIndex, auto coord) LLAMA_LAMBDA_INLINE - { - constexpr auto bytes = L * sizeof(llama::GetType); - std::memcpy(threadDst, mapSrc(srcIndex, coord), bytes); - threadDst += bytes; - }; - if constexpr (dstIsAoSoA) - { - auto* threadDst = mapDst(start, llama::RecordCoord<>{}); - for (std::size_t i = start; i < stop; i += LanesDst) - llama::forEachLeaf( - [&](auto coord) LLAMA_LAMBDA_INLINE - { - for (std::size_t j = 0; j < LanesDst; j += L) - copyLBlock(threadDst, i + j, coord); - }); - } - else - { - llama::forEachLeaf( - [&](auto coord) LLAMA_LAMBDA_INLINE - { - auto* threadDst = mapDst(start, coord); - for (std::size_t i = start; i < stop; i += L) - copyLBlock(threadDst, i, coord); - }); - } - } - } -} - -template < - bool ReadOpt, - typename ArrayDims, - typename RecordDim, - std::size_t LanesSrc, - typename SrcBlobType, - std::size_t LanesDst, - typename DstBlobType> -void aosoa_copy( - const llama::View< - llama::mapping::AoSoA, - SrcBlobType>& srcView, - llama::View< - llama::mapping::AoSoA, - DstBlobType>& dstView, - std::size_t numThreads = 1) -{ - aosoa_copy_internal(srcView, dstView, numThreads); -} - -template < - bool ReadOpt, - typename ArrayDims, - typename RecordDim, - std::size_t LanesSrc, - typename SrcBlobType, - bool DstSeparateBuffers, - typename DstBlobType> -void aosoa_copy( - const llama::View< - llama::mapping::AoSoA, - SrcBlobType>& srcView, - llama::View< - llama::mapping::SoA, - DstBlobType>& dstView, - std::size_t numThreads = 1) -{ - aosoa_copy_internal< - ReadOpt, - ArrayDims, - RecordDim, - LanesSrc, - std::numeric_limits::max(), - false, - DstSeparateBuffers>(srcView, dstView, numThreads); -} - -template < - bool ReadOpt, - typename ArrayDims, - typename RecordDim, - bool SrcSeparateBuffers, - typename SrcBlobType, - std::size_t LanesDst, - typename DstBlobType> -void aosoa_copy( - const llama::View< - llama::mapping::SoA, - SrcBlobType>& srcView, - llama::View< - llama::mapping::AoSoA, - DstBlobType>& dstView, - std::size_t numThreads = 1) -{ - aosoa_copy_internal< - ReadOpt, - ArrayDims, - RecordDim, - std::numeric_limits::max(), - LanesDst, - SrcSeparateBuffers, - false>(srcView, dstView, numThreads); -} - - template auto hash(const llama::View& view) { @@ -472,11 +191,19 @@ set ylabel "throughput 
[GiB/s]" #endif benchmarkMemcpy( "memcpy(p)", - [&](auto* dst, auto* src, auto size) { parallel_memcpy(dst, src, size, std::memcpy, numThreads); }); + [&](auto* dst, auto* src, auto size) + { +#pragma omp parallel + llama::internal::parallel_memcpy(dst, src, size, omp_get_thread_num(), omp_get_num_threads(), std::memcpy); + }); #ifdef __AVX2__ benchmarkMemcpy( "memcpy_avx2(p)", - [&](auto* dst, auto* src, auto size) { parallel_memcpy(dst, src, size, memcpy_avx2, numThreads); }); + [&](auto* dst, auto* src, auto size) + { +# pragma omp parallel + llama::internal::parallel_memcpy(dst, src, size, omp_get_thread_num(), omp_get_num_threads(), memcpy_avx2); + }); #else plotFile << "0\t"; #endif @@ -513,17 +240,17 @@ set ylabel "throughput [GiB/s]" plotFile << "0\t"; plotFile << "0\t"; plotFile << "0\t"; - benchmarkCopy("naive copy", [](const auto& srcView, auto& dstView) { naive_copy(srcView, dstView); }); + benchmarkCopy("naive copy", [](const auto& srcView, auto& dstView) { llama::fieldWiseCopy(srcView, dstView); }); benchmarkCopy("std::copy", [](const auto& srcView, auto& dstView) { std_copy(srcView, dstView); }); constexpr auto oneIsAoSoA = is_AoSoA || is_AoSoA; if constexpr (oneIsAoSoA) { benchmarkCopy( "aosoa copy(r)", - [](const auto& srcView, auto& dstView) { aosoa_copy(srcView, dstView); }); + [](const auto& srcView, auto& dstView) { llama::aosoaCommonBlockCopy(srcView, dstView, true); }); benchmarkCopy( "aosoa copy(w)", - [](const auto& srcView, auto& dstView) { aosoa_copy(srcView, dstView); }); + [](const auto& srcView, auto& dstView) { llama::aosoaCommonBlockCopy(srcView, dstView, false); }); } else { @@ -532,15 +259,27 @@ set ylabel "throughput [GiB/s]" } benchmarkCopy( "naive copy(p)", - [&](const auto& srcView, auto& dstView) { naive_copy(srcView, dstView, numThreads); }); + [&](const auto& srcView, auto& dstView) + { +#pragma omp parallel + llama::fieldWiseCopy(srcView, dstView, omp_get_thread_num(), omp_get_num_threads()); + }); if constexpr (oneIsAoSoA) { benchmarkCopy( "aosoa_copy(r,p)", - [&](const auto& srcView, auto& dstView) { aosoa_copy(srcView, dstView, numThreads); }); + [&](const auto& srcView, auto& dstView) + { +#pragma omp parallel + llama::aosoaCommonBlockCopy(srcView, dstView, true, omp_get_thread_num(), omp_get_num_threads()); + }); benchmarkCopy( "aosoa_copy(w,p)", - [&](const auto& srcView, auto& dstView) { aosoa_copy(srcView, dstView, numThreads); }); + [&](const auto& srcView, auto& dstView) + { +#pragma omp parallel + llama::aosoaCommonBlockCopy(srcView, dstView, false, omp_get_thread_num(), omp_get_num_threads()); + }); } else { diff --git a/include/llama/Copy.hpp b/include/llama/Copy.hpp new file mode 100644 index 0000000000..4075fb4baf --- /dev/null +++ b/include/llama/Copy.hpp @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include "View.hpp" +#include "mapping/AoSoA.hpp" +#include "mapping/SoA.hpp" + +#include +#include + +namespace llama +{ + namespace internal + { + inline void parallel_memcpy( + std::byte* dst, + const std::byte* src, + std::size_t size, + std::size_t threadId = 0, + std::size_t threadCount = 1, + decltype(std::memcpy) memcpy = std::memcpy) + { + const auto sizePerThread = size / threadCount; + const auto sizeLastThread = sizePerThread + size % threadCount; + const auto sizeThisThread = threadId == threadCount - 1 ? 
+    /// @brief Direct memcpy from source view blobs to destination view blobs. Both views need to have the same
+    /// mappings with the same array dimensions.
+    /// @param threadId Optional. Thread id in case of multi-threaded copy.
+    /// @param threadCount Optional. Thread count in case of multi-threaded copy.
+    template <typename Mapping, typename SrcBlob, typename DstBlob>
+    void blobMemcpy(
+        const View<Mapping, SrcBlob>& srcView,
+        View<Mapping, DstBlob>& dstView,
+        std::size_t threadId = 0,
+        std::size_t threadCount = 1)
+    {
+        // TODO: we do not verify if the mappings have other runtime state than the array dimensions
+        if (srcView.mapping.arrayDims() != dstView.mapping.arrayDims())
+            throw std::runtime_error{"Array dimensions sizes are different"};
+
+        // TODO: this is maybe not the best parallel copying strategy
+        for (std::size_t i = 0; i < Mapping::blobCount; i++)
+            internal::parallel_memcpy(
+                &dstView.storageBlobs[i][0],
+                &srcView.storageBlobs[i][0],
+                dstView.mapping.blobSize(i),
+                threadId,
+                threadCount);
+    }
+
+    /// @brief Field-wise copy from source to destination view. Both views need to have the same array and record
+    /// dimensions.
+    /// @param threadId Optional. Thread id in case of multi-threaded copy.
+    /// @param threadCount Optional. Thread count in case of multi-threaded copy.
+    template <typename SrcMapping, typename DstMapping, typename SrcBlob, typename DstBlob>
+    void fieldWiseCopy(
+        const View<SrcMapping, SrcBlob>& srcView,
+        View<DstMapping, DstBlob>& dstView,
+        std::size_t threadId = 0,
+        std::size_t threadCount = 1)
+    {
+        // TODO: think if we can remove this restriction
+        static_assert(std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>);
+
+        if (srcView.mapping.arrayDims() != dstView.mapping.arrayDims())
+            throw std::runtime_error{"Array dimensions sizes are different"};
+
+        using RecordDim = typename SrcMapping::RecordDim;
+        auto copyOne = [&](auto ad) LLAMA_LAMBDA_INLINE
+        {
+            forEachLeaf<RecordDim>([&](auto coord) LLAMA_LAMBDA_INLINE
+                                   { dstView(ad)(coord) = srcView(ad)(coord); });
+        };
+
+        constexpr auto dims = std::decay_t<decltype(srcView.mapping.arrayDims())>::rank;
+        const auto& adSize = srcView.mapping.arrayDims();
+        const auto workPerThread = (adSize[0] + threadCount - 1) / threadCount;
+        const auto start = threadId * workPerThread;
+        const auto end = std::min((threadId + 1) * workPerThread, adSize[0]);
+        for (auto i = start; i < end; i++)
+        {
+            if constexpr (dims > 1)
+                forEachADCoord(internal::popFront(adSize), copyOne, static_cast<std::size_t>(i));
+            else
+                copyOne(ArrayDims{static_cast<std::size_t>(i)});
+        }
+    }
+
+    namespace internal
+    {
+        template <typename Mapping>
+        inline constexpr std::size_t aosoaLanes = 0;
+
+        template <typename ArrayDims, typename RecordDim, bool SeparateBuffers, typename LinearizeArrayDimsFunctor>
+        inline constexpr std::size_t aosoaLanes<
+            mapping::SoA<ArrayDims, RecordDim, SeparateBuffers, LinearizeArrayDimsFunctor>> = std::
+            numeric_limits<std::size_t>::max();
+
+        template <typename ArrayDims, typename RecordDim, std::size_t Lanes, typename LinearizeArrayDimsFunctor>
+        inline constexpr std::size_t
+            aosoaLanes<mapping::AoSoA<ArrayDims, RecordDim, Lanes, LinearizeArrayDimsFunctor>> = Lanes;
+    } // namespace internal
+
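+    /// @brief Copy from source view to destination view by memcpy'ing common blocks of
+    /// min(LanesSrc, LanesDst) elements per record field. At least one of the mappings must be an AoSoA mapping.
+    /// E.g. copying from an AoSoA mapping with 8 lanes into one with 4 lanes moves blocks of 4 elements per field.
+    /// @param readOpt If true, the copy is optimized for linearly reading the source; otherwise, for linearly
+    /// writing the destination.
+    /// @param threadId Optional. Thread id in case of multi-threaded copy.
+    /// @param threadCount Optional. Thread count in case of multi-threaded copy.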
+    template <typename SrcMapping, typename DstMapping, typename SrcBlob, typename DstBlob>
+    void aosoaCommonBlockCopy(
+        const View<SrcMapping, SrcBlob>& srcView,
+        View<DstMapping, DstBlob>& dstView,
+        bool readOpt,
+        std::size_t threadId = 0,
+        std::size_t threadCount = 1)
+    {
+        // TODO: think if we can remove this restriction
+        static_assert(std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>);
+        static_assert(
+            std::is_same_v<
+                typename SrcMapping::LinearizeArrayDimsFunctor,
+                typename DstMapping::LinearizeArrayDimsFunctor>,
+            "Source and destination mapping need to use the same array dimensions linearizer");
+        using RecordDim = typename SrcMapping::RecordDim;
+
+        constexpr bool MBSrc = SrcMapping::blobCount > 1;
+        constexpr bool MBDst = DstMapping::blobCount > 1;
+        constexpr auto LanesSrc = internal::aosoaLanes<SrcMapping>;
+        constexpr auto LanesDst = internal::aosoaLanes<DstMapping>;
+
+        if (srcView.mapping.arrayDims() != dstView.mapping.arrayDims())
+            throw std::runtime_error{"Array dimensions sizes are different"};
+
+        constexpr auto srcIsAoSoA = LanesSrc != std::numeric_limits<std::size_t>::max();
+        constexpr auto dstIsAoSoA = LanesDst != std::numeric_limits<std::size_t>::max();
+
+        static_assert(srcIsAoSoA || dstIsAoSoA, "At least one of the mappings must be an AoSoA mapping");
+        static_assert(!srcIsAoSoA || decltype(srcView.storageBlobs)::rank == 1);
+        static_assert(!dstIsAoSoA || decltype(dstView.storageBlobs)::rank == 1);
+
+        const auto arrayDims = dstView.mapping.arrayDims();
+        const auto flatSize
+            = std::reduce(std::begin(arrayDims), std::end(arrayDims), std::size_t{1}, std::multiplies<>{});
+
+        // the same as AoSoA::blobNrAndOffset but takes a flat array index
+        auto mapAoSoA = [](std::size_t flatArrayIndex, auto coord, std::size_t Lanes) LLAMA_LAMBDA_INLINE
+        {
+            const auto blockIndex = flatArrayIndex / Lanes;
+            const auto laneIndex = flatArrayIndex % Lanes;
+            const auto offset = (sizeOf<RecordDim> * Lanes) * blockIndex
+                + offsetOf<RecordDim, decltype(coord)> * Lanes
+                + sizeof(GetType<RecordDim, decltype(coord)>) * laneIndex;
+            return offset;
+        };
+        // the same as SoA::blobNrAndOffset but takes a flat array index
+        auto mapSoA = [&](std::size_t flatArrayIndex, auto coord, bool mb) LLAMA_LAMBDA_INLINE
+        {
+            const auto blob = mb * flatRecordCoord<RecordDim, decltype(coord)>;
+            const auto offset = !mb * offsetOf<RecordDim, decltype(coord)> * flatSize
+                + sizeof(GetType<RecordDim, decltype(coord)>) * flatArrayIndex;
+            return NrAndOffset{blob, offset};
+        };
+
+        auto mapSrc = [&srcView, &mapAoSoA, &mapSoA](std::size_t flatArrayIndex, auto coord) LLAMA_LAMBDA_INLINE
+        {
+            if constexpr (srcIsAoSoA)
+                return &srcView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, coord, LanesSrc);
+            else
+            {
+                const auto [blob, off] = mapSoA(flatArrayIndex, coord, MBSrc);
+                return &srcView.storageBlobs[blob][off];
+            }
+        };
+        auto mapDst = [&dstView, &mapAoSoA, &mapSoA](std::size_t flatArrayIndex, auto coord) LLAMA_LAMBDA_INLINE
+        {
+            if constexpr (dstIsAoSoA)
+                return &dstView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, coord, LanesDst);
+            else
+            {
+                const auto [blob, off] = mapSoA(flatArrayIndex, coord, MBDst);
+                return &dstView.storageBlobs[blob][off];
+            }
+        };
+
+        constexpr auto L = std::min(LanesSrc, LanesDst);
+        static_assert(!srcIsAoSoA || LanesSrc % L == 0);
+        static_assert(!dstIsAoSoA || LanesDst % L == 0);
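+        // Each thread gets a chunk of the flat array, rounded down to a multiple of the traversed block size so
+        // every thread starts at a block boundary; the last thread also handles the remainder.
+        // E.g. flatSize = 100, L = 8, threadCount = 4: the threads copy 24, 24, 24 and 28 elements.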
+        if (readOpt)
+        {
+            // optimized for linear reading
+            const auto elementsPerThread
+                = srcIsAoSoA ? flatSize / LanesSrc / threadCount * LanesSrc : flatSize / L / threadCount * L;
+            {
+                const auto start = threadId * elementsPerThread;
+                const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;
+
+                auto copyLBlock = [&](const std::byte*& threadSrc, std::size_t dstIndex, auto coord) LLAMA_LAMBDA_INLINE
+                {
+                    constexpr auto bytes = L * sizeof(GetType<RecordDim, decltype(coord)>);
+                    std::memcpy(mapDst(dstIndex, coord), threadSrc, bytes);
+                    threadSrc += bytes;
+                };
+                if constexpr (srcIsAoSoA)
+                {
+                    auto* threadSrc = mapSrc(start, RecordCoord<>{});
+                    for (std::size_t i = start; i < stop; i += LanesSrc)
+                        forEachLeaf<RecordDim>(
+                            [&](auto coord) LLAMA_LAMBDA_INLINE
+                            {
+                                for (std::size_t j = 0; j < LanesSrc; j += L)
+                                    copyLBlock(threadSrc, i + j, coord);
+                            });
+                }
+                else
+                {
+                    forEachLeaf<RecordDim>(
+                        [&](auto coord) LLAMA_LAMBDA_INLINE
+                        {
+                            auto* threadSrc = mapSrc(start, coord);
+                            for (std::size_t i = start; i < stop; i += L)
+                                copyLBlock(threadSrc, i, coord);
+                        });
+                }
+            }
+        }
+        else
+        {
+            // optimized for linear writing
+            const auto elementsPerThread
+                = dstIsAoSoA ? ((flatSize / LanesDst) / threadCount) * LanesDst : flatSize / L / threadCount * L;
+            {
+                const auto start = threadId * elementsPerThread;
+                const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;
+
+                auto copyLBlock = [&](std::byte*& threadDst, std::size_t srcIndex, auto coord) LLAMA_LAMBDA_INLINE
+                {
+                    constexpr auto bytes = L * sizeof(GetType<RecordDim, decltype(coord)>);
+                    std::memcpy(threadDst, mapSrc(srcIndex, coord), bytes);
+                    threadDst += bytes;
+                };
+                if constexpr (dstIsAoSoA)
+                {
+                    auto* threadDst = mapDst(start, RecordCoord<>{});
+                    for (std::size_t i = start; i < stop; i += LanesDst)
+                        forEachLeaf<RecordDim>(
+                            [&](auto coord) LLAMA_LAMBDA_INLINE
+                            {
+                                for (std::size_t j = 0; j < LanesDst; j += L)
+                                    copyLBlock(threadDst, i + j, coord);
+                            });
+                }
+                else
+                {
+                    forEachLeaf<RecordDim>(
+                        [&](auto coord) LLAMA_LAMBDA_INLINE
+                        {
+                            auto* threadDst = mapDst(start, coord);
+                            for (std::size_t i = start; i < stop; i += L)
+                                copyLBlock(threadDst, i, coord);
+                        });
+                }
+            }
+        }
+    }
+
+    /// @brief Generic implementation of \ref copy. LLAMA provides several specializations of this construct for
+    /// specific mappings. Users are encouraged to also specialize this template with better copy algorithms.
+    template <typename SrcMapping, typename DstMapping, typename SFINAE = void>
+    struct Copy
+    {
+        template <typename SrcView, typename DstView>
+        void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const
+        {
+            fieldWiseCopy(srcView, dstView, threadId, threadCount);
+        }
+    };
+
+    template <typename Mapping>
+    struct Copy<Mapping, Mapping>
+    {
+        template <typename SrcView, typename DstView>
+        void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const
+        {
+            blobMemcpy(srcView, dstView, threadId, threadCount);
+        }
+    };
+
+    template <
+        typename ArrayDims,
+        typename RecordDim,
+        typename LinearizeArrayDims,
+        std::size_t LanesSrc,
+        std::size_t LanesDst>
+    struct Copy<
+        mapping::AoSoA<ArrayDims, RecordDim, LanesSrc, LinearizeArrayDims>,
+        mapping::AoSoA<ArrayDims, RecordDim, LanesDst, LinearizeArrayDims>,
+        std::enable_if_t<LanesSrc != LanesDst>>
+    {
+        template <typename SrcBlob, typename DstBlob>
+        void operator()(
+            const View<mapping::AoSoA<ArrayDims, RecordDim, LanesSrc, LinearizeArrayDims>, SrcBlob>& srcView,
+            View<mapping::AoSoA<ArrayDims, RecordDim, LanesDst, LinearizeArrayDims>, DstBlob>& dstView,
+            std::size_t threadId,
+            std::size_t threadCount)
+        {
+            constexpr auto readOpt = true; // TODO: how to choose?
+            aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
+        }
+    };
+
+    template <
+        typename ArrayDims,
+        typename RecordDim,
+        typename LinearizeArrayDims,
+        std::size_t LanesSrc,
+        bool DstSeparateBuffers>
+    struct Copy<
+        mapping::AoSoA<ArrayDims, RecordDim, LanesSrc, LinearizeArrayDims>,
+        mapping::SoA<ArrayDims, RecordDim, DstSeparateBuffers, LinearizeArrayDims>>
+    {
+        template <typename SrcBlob, typename DstBlob>
+        void operator()(
+            const View<mapping::AoSoA<ArrayDims, RecordDim, LanesSrc, LinearizeArrayDims>, SrcBlob>& srcView,
+            View<mapping::SoA<ArrayDims, RecordDim, DstSeparateBuffers, LinearizeArrayDims>, DstBlob>& dstView,
+            std::size_t threadId,
+            std::size_t threadCount)
+        {
+            constexpr auto readOpt = true; // TODO: how to choose?
+            aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
+        }
+    };
+
+    template <
+        typename ArrayDims,
+        typename RecordDim,
+        typename LinearizeArrayDims,
+        std::size_t LanesDst,
+        bool SrcSeparateBuffers>
+    struct Copy<
+        mapping::SoA<ArrayDims, RecordDim, SrcSeparateBuffers, LinearizeArrayDims>,
+        mapping::AoSoA<ArrayDims, RecordDim, LanesDst, LinearizeArrayDims>>
+    {
+        template <typename SrcBlob, typename DstBlob>
+        void operator()(
+            const View<mapping::SoA<ArrayDims, RecordDim, SrcSeparateBuffers, LinearizeArrayDims>, SrcBlob>& srcView,
+            View<mapping::AoSoA<ArrayDims, RecordDim, LanesDst, LinearizeArrayDims>, DstBlob>& dstView,
+            std::size_t threadId,
+            std::size_t threadCount)
+        {
+            constexpr auto readOpt = true; // TODO: how to choose?
+            aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
+        }
+    };
+
+    /// Copy data from source view to destination view. Both views need to have the same array and record
+    /// dimensions. Delegates to \ref Copy to choose an implementation.
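+    /// @param threadId Optional. Thread id in case of multi-threaded copy.
+    /// @param threadCount Optional. Thread count in case of multi-threaded copy.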
+    template <typename SrcMapping, typename DstMapping, typename SrcBlob, typename DstBlob>
+    void copy(
+        const View<SrcMapping, SrcBlob>& srcView,
+        View<DstMapping, DstBlob>& dstView,
+        std::size_t threadId = 0,
+        std::size_t threadCount = 1)
+    {
+        Copy<SrcMapping, DstMapping>{}(srcView, dstView, threadId, threadCount);
+    }
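+
+    // Typical usage (see also docs/pages/copying.rst):
+    //     llama::copy(srcView, dstView); // delegates to the best matching Copy specialization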
+} // namespace llama
diff --git a/include/llama/llama.hpp b/include/llama/llama.hpp
index c552ac7170..fa135dc765 100644
--- a/include/llama/llama.hpp
+++ b/include/llama/llama.hpp
@@ -33,6 +33,7 @@
 #include "ArrayDimsIndexRange.hpp"
 #include "BlobAllocators.hpp"
+#include "Copy.hpp"
 #include "Core.hpp"
 #include "Vector.hpp"
 #include "View.hpp"
diff --git a/include/llama/mapping/AoS.hpp b/include/llama/mapping/AoS.hpp
index 71c0300913..7b794f5374 100644
--- a/include/llama/mapping/AoS.hpp
+++ b/include/llama/mapping/AoS.hpp
@@ -10,17 +10,18 @@ namespace llama::mapping
     /// Array of struct mapping. Used to create a \ref View via \ref allocView.
     /// \tparam AlignAndPad If true, padding bytes are inserted to guarantee that struct members are properly aligned.
     /// If false, struct members are tightly packed.
-    /// \tparam LinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
+    /// \tparam T_LinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
     /// how big the linear domain gets.
     template <
         typename T_ArrayDims,
         typename T_RecordDim,
         bool AlignAndPad = true,
-        typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
+        typename T_LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
     struct AoS
     {
         using ArrayDims = T_ArrayDims;
         using RecordDim = T_RecordDim;
+        using LinearizeArrayDimsFunctor = T_LinearizeArrayDimsFunctor;
         static constexpr std::size_t blobCount = 1;
 
         constexpr AoS() = default;
diff --git a/include/llama/mapping/AoSoA.hpp b/include/llama/mapping/AoSoA.hpp
index 1f29f0f7e8..d016fab549 100644
--- a/include/llama/mapping/AoSoA.hpp
+++ b/include/llama/mapping/AoSoA.hpp
@@ -26,17 +26,18 @@ namespace llama::mapping
     /// Array of struct of arrays mapping. Used to create a \ref View via \ref allocView.
     /// \tparam Lanes The size of the inner arrays of this array of struct of arrays.
-    /// \tparam LinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
+    /// \tparam T_LinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
     /// how big the linear domain gets.
     template <
         typename T_ArrayDims,
         typename T_RecordDim,
         std::size_t Lanes,
-        typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
+        typename T_LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
     struct AoSoA
     {
         using ArrayDims = T_ArrayDims;
         using RecordDim = T_RecordDim;
+        using LinearizeArrayDimsFunctor = T_LinearizeArrayDimsFunctor;
         static constexpr std::size_t blobCount = 1;
 
         constexpr AoSoA() = default;
diff --git a/include/llama/mapping/SoA.hpp b/include/llama/mapping/SoA.hpp
index 640a7c3262..dc41b593e6 100644
--- a/include/llama/mapping/SoA.hpp
+++ b/include/llama/mapping/SoA.hpp
@@ -17,11 +17,12 @@ namespace llama::mapping
         typename T_ArrayDims,
         typename T_RecordDim,
         bool SeparateBuffers = true,
-        typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
+        typename T_LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
     struct SoA
     {
         using ArrayDims = T_ArrayDims;
         using RecordDim = T_RecordDim;
+        using LinearizeArrayDimsFunctor = T_LinearizeArrayDimsFunctor;
         static constexpr std::size_t blobCount
             = SeparateBuffers ? boost::mp11::mp_size<FlatRecordDim<RecordDim>>::value : 1;
diff --git a/tests/copy.cpp b/tests/copy.cpp
new file mode 100644
index 0000000000..a766dd0a33
--- /dev/null
+++ b/tests/copy.cpp
@@ -0,0 +1,113 @@
+#include "common.h"
+
+#include <catch2/catch.hpp>
+#include <llama/llama.hpp>
+
+namespace
+{
+    using ArrayDims = llama::ArrayDims<2>;
+    using RecordDim = Vec3I;
+
+    template <typename SrcMapping, typename DstMapping, typename CopyFunc>
+    void testCopy(CopyFunc copy)
+    {
+        const auto viewSize = ArrayDims{4, 8};
+        const auto srcMapping = SrcMapping(viewSize);
+        auto srcView = llama::allocView(srcMapping);
+        auto value = std::size_t{0};
+        for (auto ad : llama::ArrayDimsIndexRange{srcMapping.arrayDims()})
+            llama::forEachLeaf<RecordDim>(
+                [&](auto coord)
+                {
+                    srcView(ad)(coord) = value;
+                    value++;
+                });
+
+        auto dstView = llama::allocView(DstMapping(viewSize));
+        copy(srcView, dstView);
+
+        value = 0;
+        for (auto ad : llama::ArrayDimsIndexRange{srcMapping.arrayDims()})
+            llama::forEachLeaf<RecordDim>(
+                [&](auto coord)
+                {
+                    CHECK(dstView(ad)(coord) == value);
+                    value++;
+                });
+    }
+
+    template <typename Mapping>
+    inline constexpr bool isSoA = false;
+
+    template <typename ArrayDims, typename RecordDim, bool SeparateBuffers, typename LinearizeArrayDimsFunctor>
+    inline constexpr bool
+        isSoA<llama::mapping::SoA<ArrayDims, RecordDim, SeparateBuffers, LinearizeArrayDimsFunctor>> = true;
+
+    // Do not test all combinations as this explodes the unit test compile time and runtime.
+    using AoSMappings = boost::mp11::mp_list<
+        llama::mapping::AoS<ArrayDims, RecordDim, false, llama::mapping::LinearizeArrayDimsCpp>,
+        // llama::mapping::AoS<ArrayDims, RecordDim, false, llama::mapping::LinearizeArrayDimsFortran>,
+        // llama::mapping::AoS<ArrayDims, RecordDim, true, llama::mapping::LinearizeArrayDimsCpp>,
+        llama::mapping::AoS<ArrayDims, RecordDim, true, llama::mapping::LinearizeArrayDimsFortran>>;
+
+    using OtherMappings = boost::mp11::mp_list<
+        llama::mapping::SoA<ArrayDims, RecordDim, false, llama::mapping::LinearizeArrayDimsCpp>,
+        // llama::mapping::SoA<ArrayDims, RecordDim, false, llama::mapping::LinearizeArrayDimsFortran>,
+        // llama::mapping::SoA<ArrayDims, RecordDim, true, llama::mapping::LinearizeArrayDimsCpp>,
+        llama::mapping::SoA<ArrayDims, RecordDim, true, llama::mapping::LinearizeArrayDimsFortran>,
+        llama::mapping::AoSoA<ArrayDims, RecordDim, 4, llama::mapping::LinearizeArrayDimsCpp>,
+        // llama::mapping::AoSoA<ArrayDims, RecordDim, 4, llama::mapping::LinearizeArrayDimsFortran>,
+        // llama::mapping::AoSoA<ArrayDims, RecordDim, 8, llama::mapping::LinearizeArrayDimsCpp>,
+        llama::mapping::AoSoA<ArrayDims, RecordDim, 8, llama::mapping::LinearizeArrayDimsFortran>>;
+
+    using AllMappings = boost::mp11::mp_append<AoSMappings, OtherMappings>;
+
+    using AllMappingsProduct = boost::mp11::mp_product<boost::mp11::mp_list, AllMappings, AllMappings>;
+
+    template <typename List>
+    using BothAreSoAOrHaveDifferentLinearizer = std::bool_constant<
+        (isSoA<boost::mp11::mp_first<List>> && isSoA<boost::mp11::mp_second<List>>)
+        || !std::is_same_v<
+            typename boost::mp11::mp_first<List>::LinearizeArrayDimsFunctor,
+            typename boost::mp11::mp_second<List>::LinearizeArrayDimsFunctor>>;
+
+    using AoSoAMappingsProduct = boost::mp11::mp_remove_if<
+        boost::mp11::mp_product<boost::mp11::mp_list, OtherMappings, OtherMappings>,
+        BothAreSoAOrHaveDifferentLinearizer>;
+} // namespace
+
+TEMPLATE_LIST_TEST_CASE("copy", "", AllMappingsProduct)
+{
+    using SrcMapping = boost::mp11::mp_first<TestType>;
+    using DstMapping = boost::mp11::mp_second<TestType>;
+    testCopy<SrcMapping, DstMapping>([](const auto& srcView, auto& dstView) { llama::copy(srcView, dstView); });
+}
+
+TEMPLATE_LIST_TEST_CASE("blobMemcpy", "", AllMappings)
+{
+    testCopy<TestType, TestType>([](const auto& srcView, auto& dstView) { llama::blobMemcpy(srcView, dstView); });
+}
+
+TEMPLATE_LIST_TEST_CASE("fieldWiseCopy", "", AllMappingsProduct)
+{
+    using SrcMapping = boost::mp11::mp_first<TestType>;
+    using DstMapping = boost::mp11::mp_second<TestType>;
+    testCopy<SrcMapping, DstMapping>(
+        [](const auto& srcView, auto& dstView) { llama::fieldWiseCopy(srcView, dstView); });
+}
+
+TEMPLATE_LIST_TEST_CASE("aosoaCommonBlockCopy.readOpt", "", AoSoAMappingsProduct)
+{
+    using SrcMapping = boost::mp11::mp_first<TestType>;
+    using DstMapping = boost::mp11::mp_second<TestType>;
+    testCopy<SrcMapping, DstMapping>(
+        [](const auto& srcView, auto& dstView) { llama::aosoaCommonBlockCopy(srcView, dstView, true); });
+}
+
+TEMPLATE_LIST_TEST_CASE("aosoaCommonBlockCopy.writeOpt", "", AoSoAMappingsProduct)
+{
+    using SrcMapping = boost::mp11::mp_first<TestType>;
+    using DstMapping = boost::mp11::mp_second<TestType>;
+    testCopy<SrcMapping, DstMapping>(
+        [](const auto& srcView, auto& dstView) { llama::aosoaCommonBlockCopy(srcView, dstView, false); });
+}