diff --git a/.clang-tidy b/.clang-tidy
index 1c9d01d197..fb4d517538 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -50,7 +50,8 @@ Checks: >
   -misc-no-recursion,
   -llvm-header-guard,
   -cppcoreguidelines-macro-usage,
-  -fuchsia-statically-constructed-objects
+  -fuchsia-statically-constructed-objects,
+  -cppcoreguidelines-pro-type-union-access
 WarningsAsErrors: '*'
 HeaderFilterRegex: ''
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index fb6022920a..be55db938c 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -72,9 +72,16 @@ jobs:
     - uses: actions/checkout@v2
       with:
         fetch-depth: 1
-    - name: apt install boost and lcov
+    - name: install boost
+      run: |
+        BOOST_ARCHIVE=boost_${BOOST_VERSION//./_}.tar.bz2
+        wget -q https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION/source/$BOOST_ARCHIVE
+        tar -xf $BOOST_ARCHIVE
+        rm $BOOST_ARCHIVE
+        mv boost_${BOOST_VERSION//./_} "${BOOST_ROOT}"
+    - name: install lcov
       run: |
-        sudo apt install libboost-all-dev lcov
+        sudo apt install lcov
     - name: vcpkg install dependencies
       run: |
        vcpkg install catch2 fmt
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 96697fbccb..d975590084 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,6 +8,7 @@
 add_library(${PROJECT_NAME} INTERFACE)
 target_include_directories(${PROJECT_NAME} INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include>)
 target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17)
 target_link_libraries(${PROJECT_NAME} INTERFACE Boost::headers)
+add_compile_definitions(BOOST_ATOMIC_NO_LIB) # we don't need the compiled part of Boost.Atomic in LLAMA or its examples
 if (fmt_FOUND)
     target_link_libraries(${PROJECT_NAME} INTERFACE fmt::fmt)
 else()
diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in
index ddcd8f1eb0..8e11ee4aa1 100644
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -16,9 +16,15 @@ endif()
 set(llama_INCLUDE_DIR ${llama_INCLUDE_DIR} "${PACKAGE_PREFIX_DIR}/include")
 target_include_directories(llama INTERFACE ${llama_INCLUDE_DIR})
 
-# Boost
+# dependencies
 find_package(Boost 1.70.0 REQUIRED)
+find_package(fmt CONFIG QUIET)
 target_link_libraries(llama INTERFACE Boost::headers)
+if (fmt_FOUND)
+    target_link_libraries(llama INTERFACE fmt::fmt)
+else()
+    message(WARNING "The fmt library was not found. You cannot use llama's dumping facilities.")
+endif()
 
 INCLUDE(FindPackageHandleStandardArgs)
diff --git a/docs/pages/mappings.rst b/docs/pages/mappings.rst
index bc8af723ef..b1048cfae7 100644
--- a/docs/pages/mappings.rst
+++ b/docs/pages/mappings.rst
@@ -206,17 +206,27 @@ Trace
 -----
 
 The Trace mapping is a meta mapping that wraps over an inner mapping and counts all accesses made to the fields of the record dimension.
-A report is printed to the stdout when requested or the mapping instance is destroyed.
+A report is printed to stdout when requested.
+The mapping adds an additional blob to the blobs of the inner mapping, which is used as storage for the access counts.
 
 .. code-block:: C++
 
-    {
-        auto anyMapping = ...;
-        llama::mapping::Trace mapping{anyMapping};
-        ...
-        mapping.print(); // print report explicitly
-    } // report is printed implicitly
+    auto anyMapping = ...;
+    llama::mapping::Trace mapping{anyMapping};
+    auto view = llama::allocView(mapping);
+    ...
+    view.mapping().printFieldHits(view.storageBlobs); // print report with reads and writes to each field
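+
+The per-field counters can also be read programmatically via ``fieldHits``, which returns a reference to the array
+of ``AccessCounts`` stored in the additional blob, with one entry per field of the record dimension.
+A minimal sketch, assuming the ``view`` from the snippet above:
+
+.. code-block:: C++
+
+    const auto& hits = view.mapping().fieldHits(view.storageBlobs);
+    for(const auto& fieldCounts : hits)
+        std::cout << "R: " << fieldCounts.reads << "\tW: " << fieldCounts.writes << '\n';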
+
+The Trace mapping uses proxy references to instrument reads and writes.
+If this is problematic, Trace can also be configured to return raw C++ references.
+In that case, only the number of memory location computations can be traced,
+but not how often the program reads from or writes to those locations.
+
+.. code-block:: C++
+
+    auto anyMapping = ...;
+    llama::mapping::Trace<decltype(anyMapping), std::size_t, false> mapping{anyMapping};
+    auto view = llama::allocView(mapping);
+    ...
+    view.mapping().printFieldHits(view.storageBlobs); // print report with the number of computed memory locations
 
 Null
 ----
diff --git a/examples/cuda/nbody/nbody.cu b/examples/cuda/nbody/nbody.cu
index edf0452b73..951aaefaf5 100644
--- a/examples/cuda/nbody/nbody.cu
+++ b/examples/cuda/nbody/nbody.cu
@@ -17,6 +17,7 @@ constexpr auto SHARED_ELEMENTS_PER_BLOCK = 512;
 constexpr auto STEPS = 5; ///< number of steps to calculate
 constexpr auto ALLOW_RSQRT = true; // rsqrt can be way faster, but less accurate
 constexpr auto RUN_UPATE = true; // run update step. Useful to disable for benchmarking the move step.
+constexpr auto TRACE = true;
 constexpr FP TIMESTEP = 0.0001f;
 constexpr auto THREADS_PER_BLOCK = 256;
@@ -28,6 +29,12 @@ static_assert(SHARED_ELEMENTS_PER_BLOCK % THREADS_PER_BLOCK == 0);
 
 constexpr FP EPS2 = 0.01;
 
+#if __CUDA_ARCH__ >= 600
+using CountType = unsigned long long int;
+#else
+using CountType = unsigned;
+#endif
+
 using namespace std::literals;
 
 // clang-format off
@@ -205,12 +212,19 @@ try
             llama::mapping::BindSoA<>::fn,
             true>{extents};
     }();
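+    // if constexpr alone cannot give a variable two different types, so an immediately invoked
+    // lambda is used to select between the traced and the untraced mapping type at compile time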
+    auto tmapping = [&]
+    {
+        if constexpr(TRACE)
+            return llama::mapping::Trace<std::decay_t<decltype(mapping)>, CountType>{mapping};
+        else
+            return mapping;
+    }();
 
     Stopwatch watch;
 
-    auto hostView = llama::allocViewUninitialized(mapping);
+    auto hostView = llama::allocViewUninitialized(tmapping);
     auto accView = llama::allocViewUninitialized(
-        mapping,
+        tmapping,
         [](auto alignment, std::size_t size)
         {
             std::byte* p = nullptr;
@@ -236,6 +250,8 @@ try
             p(tag::Mass()) = distribution(engine) / FP(100);
             hostView(i) = p;
         }
+        if constexpr(TRACE)
+            hostView.mapping().fieldHits(hostView.storageBlobs) = {};
 
         watch.printAndReset("init");
 
@@ -255,12 +271,15 @@ try
         };
 
         start();
-        for(auto i = 0; i < accView.storageBlobs.size(); i++)
+        const auto blobs = hostView.storageBlobs.size();
+        for(std::size_t i = 0; i < blobs; i++)
             checkError(cudaMemcpy(
                 accView.storageBlobs[i],
                 hostView.storageBlobs[i].data(),
-                mapping.blobSize(i),
+                hostView.mapping().blobSize(i),
                 cudaMemcpyHostToDevice));
+        if constexpr(TRACE)
+            cudaMemset(accView.storageBlobs[blobs - 1], 0, accView.mapping().blobSize(blobs - 1)); // init trace count buffer
         std::cout << "copy H->D " << stop() << " s\n";
 
         const auto blocks = PROBLEM_SIZE / THREADS_PER_BLOCK;
@@ -290,15 +309,18 @@ try
         plotFile << std::quoted(title) << "\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n';
 
         start();
-        for(auto i = 0; i < accView.storageBlobs.size(); i++)
+        for(std::size_t i = 0; i < blobs; i++)
             checkError(cudaMemcpy(
                 hostView.storageBlobs[i].data(),
                 accView.storageBlobs[i],
-                mapping.blobSize(i),
+                hostView.mapping().blobSize(i),
                 cudaMemcpyDeviceToHost));
         std::cout << "copy D->H " << stop() << " s\n";
 
-        for(auto i = 0; i < accView.storageBlobs.size(); i++)
+        if constexpr(TRACE)
+            hostView.mapping().printFieldHits(hostView.storageBlobs);
+
+        for(std::size_t i = 0; i < accView.storageBlobs.size(); i++)
             checkError(cudaFree(accView.storageBlobs[i]));
         checkError(cudaEventDestroy(startEvent));
         checkError(cudaEventDestroy(stopEvent));
diff --git a/examples/nbody/nbody.cpp b/examples/nbody/nbody.cpp
index 28b9d82ba2..5cf4ae2449 100644
--- a/examples/nbody/nbody.cpp
+++ b/examples/nbody/nbody.cpp
@@ -190,6 +190,8 @@ namespace usellama
             p(tag::Vel{}, tag::Z{}) = dist(engine) / FP(10);
             p(tag::Mass{}) = dist(engine) / FP(100);
         }
+        if constexpr(TRACE)
+            particles.mapping().fieldHits(particles.storageBlobs) = {};
 
         watch.printAndReset("init");
         double sumUpdate = 0;
@@ -207,7 +209,9 @@ namespace usellama
         plotFile << std::quoted(title) << "\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n';
 
         if constexpr(HEATMAP)
-            std::ofstream("nbody_heatmap_" + mappingName(Mapping) + ".sh") << particles.mapping.toGnuplotScript();
+            std::ofstream("nbody_heatmap_" + mappingName(Mapping) + ".sh") << particles.mapping().toGnuplotScript();
+        if constexpr(TRACE)
+            particles.mapping().printFieldHits(particles.storageBlobs);
 
         return 0;
     }
diff --git a/include/llama/Array.hpp b/include/llama/Array.hpp
index 856235c9e2..32669f207b 100644
--- a/include/llama/Array.hpp
+++ b/include/llama/Array.hpp
@@ -24,6 +24,11 @@ namespace llama
             return N;
         }
 
+        LLAMA_FN_HOST_ACC_INLINE constexpr auto empty() const -> bool
+        {
+            return N == 0;
+        }
+
         LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T*
         {
             return &element[0];
@@ -100,6 +105,11 @@ namespace llama
             return 0;
         }
 
+        LLAMA_FN_HOST_ACC_INLINE constexpr auto empty() const -> bool
+        {
+            return true;
+        }
+
         LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T*
         {
             return nullptr;
diff --git a/include/llama/View.hpp b/include/llama/View.hpp
index a362eee89f..c057f9b1a0 100644
--- a/include/llama/View.hpp
+++ b/include/llama/View.hpp
@@ -376,11 +376,13 @@ namespace llama
             return static_cast<Mapping&>(*this);
         }
 
+#if !(defined(_MSC_VER) && defined(__NVCC__))
         template<typename V>
         auto operator()(llama::ArrayIndex<V, ArrayExtents::rank>) const
         {
             static_assert(!sizeof(V), "Passed ArrayIndex with SizeType different than Mapping::ArrayExtent");
         }
+#endif
 
         /// Retrieves the \ref VirtualRecord at the given \ref ArrayIndex index.
         LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) const -> decltype(auto)
@@ -451,11 +453,13 @@ namespace llama
             return (*this)(ai);
         }
 
+#if !(defined(_MSC_VER) && defined(__NVCC__))
         template<typename V>
         auto operator[](llama::ArrayIndex<V, ArrayExtents::rank>) const
         {
             static_assert(!sizeof(V), "Passed ArrayIndex with SizeType different than Mapping::ArrayExtent");
         }
+#endif
 
         /// Retrieves the \ref VirtualRecord at the 1D \ref ArrayIndex index constructed from the passed index.
         LLAMA_FN_HOST_ACC_INLINE auto operator[](size_type index) const -> decltype(auto)
diff --git a/include/llama/mapping/Trace.hpp b/include/llama/mapping/Trace.hpp
index d56062e9ff..df7661dca5 100644
--- a/include/llama/mapping/Trace.hpp
+++ b/include/llama/mapping/Trace.hpp
@@ -2,16 +2,51 @@
 
 #include "Common.hpp"
 
-#include <array>
 #include <atomic>
 #include <iostream>
 
+#ifndef __cpp_lib_atomic_ref
+#    include <boost/atomic/atomic_ref.hpp>
+#endif
+
 namespace llama::mapping
 {
-    /// Forwards all calls to the inner mapping. Traces all accesses made through this mapping and prints a summary on
-    /// destruction.
-    /// \tparam Mapping The type of the inner mapping.
-    template<typename Mapping>
+    namespace internal
+    {
+        template<typename CountType>
+        LLAMA_FN_HOST_ACC_INLINE void atomicInc(CountType& i)
+        {
+#ifdef __CUDA_ARCH__
+            // if you get an error here that there is no overload of atomicAdd, your CMAKE_CUDA_ARCHITECTURE might be
+            // too low, or you need to use a smaller CountType for the Trace mapping.
+            atomicAdd(&i, CountType{1});
+#elif defined(__cpp_lib_atomic_ref)
+            ++std::atomic_ref<CountType>{i};
+#else
+            ++boost::atomic_ref<CountType>{i};
+#endif
+        }
+    } // namespace internal
+
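+    /// Counters for the accesses to a single field. The reads counter shares storage with memLocsComputed: with
+    /// proxy references, reads and writes are counted separately; otherwise only memory location computations are counted.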
+    template<typename CountType>
+    struct AccessCounts
+    {
+        union
+        {
+            CountType memLocsComputed;
+            CountType reads;
+        };
+        CountType writes;
+    };
+
+    /// Forwards all calls to the inner mapping. Traces all accesses made through this mapping and allows printing a
+    /// summary.
+    /// \tparam Mapping The type of the inner mapping.
+    /// \tparam CountType The type used for counting the number of accesses.
+    /// \tparam MyCodeHandlesProxyReferences If false, Trace will avoid proxy references, but can then only count
+    /// the number of address computations.
+    template<typename Mapping, typename CountType = std::size_t, bool MyCodeHandlesProxyReferences = true>
     struct Trace : Mapping
     {
     private:
@@ -19,66 +54,127 @@ namespace llama::mapping
 
     public:
         using RecordDim = typename Mapping::RecordDim;
+        using FieldHitsArray = Array<AccessCounts<CountType>, flatFieldCount<RecordDim>>;
+
+        inline static constexpr auto blobCount = Mapping::blobCount + 1;
 
         constexpr Trace() = default;
 
         LLAMA_FN_HOST_ACC_INLINE
-        explicit Trace(Mapping mapping, bool printOnDestruction = true)
-            : Mapping(mapping)
-            , fieldHits{}
-            , printOnDestruction(printOnDestruction)
+        explicit Trace(Mapping mapping) : Mapping(std::move(mapping))
         {
         }
 
-        Trace(const Trace&) = delete;
-        auto operator=(const Trace&) -> Trace& = delete;
+        LLAMA_FN_HOST_ACC_INLINE
+        constexpr auto blobSize(size_type blobIndex) const -> size_type
+        {
+            if(blobIndex < size_type{Mapping::blobCount})
+                return inner().blobSize(blobIndex);
+            return sizeof(FieldHitsArray);
+        }
 
-        Trace(Trace&& other) noexcept
-            : Mapping(std::move(static_cast<Mapping&>(other)))
-            , printOnDestruction(other.printOnDestruction)
+        template<std::size_t... RecordCoords>
+        static constexpr auto isComputed(RecordCoord<RecordCoords...>)
         {
-            for(std::size_t i = 0; i < fieldHits.size(); i++)
-                fieldHits[i] = other.fieldHits[i].load();
-            other.printOnDestruction = false;
+            return true;
         }
 
-        auto operator=(Trace&& other) noexcept -> Trace&
+        template<std::size_t... RecordCoords, typename Blobs>
+        LLAMA_FN_HOST_ACC_INLINE auto compute(
+            typename Mapping::ArrayIndex ai,
+            RecordCoord<RecordCoords...> rc,
+            Blobs& blobs) const -> decltype(auto)
         {
-            static_cast<Mapping&>(*this) = std::move(static_cast<Mapping&>(other));
-            printOnDestruction = other.printOnDestruction;
-            for(std::size_t i = 0; i < fieldHits.size(); i++)
-                fieldHits[i] = other.fieldHits[i].load();
-            other.printOnDestruction = false;
-            return *this;
+            auto& hits = fieldHits(blobs)[+flatRecordCoord<RecordDim, RecordCoord<RecordCoords...>>];
+            auto&& ref = mapToMemory(inner(), ai, rc, blobs);
+            if constexpr(MyCodeHandlesProxyReferences)
+            {
+                using Ref = decltype(mapToMemory(inner(), ai, rc, blobs)); // T& or proxy reference
+                using VT = GetType<RecordDim, RecordCoord<RecordCoords...>>;
+                struct Reference : ProxyRefOpMixin<Reference, VT>
+                {
+                    using value_type = VT;
+
+                    Ref r;
+                    AccessCounts<CountType>* hits;
+
+                    LLAMA_FN_HOST_ACC_INLINE auto operator=(value_type t) -> Reference&
+                    {
+                        internal::atomicInc(hits->writes);
+                        r = t;
+                        return *this;
+                    }
+
+                    LLAMA_FN_HOST_ACC_INLINE operator value_type() const
+                    {
+                        internal::atomicInc(hits->reads);
+                        return static_cast<value_type>(r);
+                    }
+                };
+                return Reference{{}, std::forward<Ref>(ref), &hits};
+            }
+            else
+            {
+                internal::atomicInc(hits.memLocsComputed);
+                return ref;
+            }
         }
 
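+        /// Returns the array of per-field access counters, which is stored in the mapping's additional last blob.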
-        ~Trace()
+        template<typename Blobs>
+        LLAMA_FN_HOST_ACC_INLINE auto fieldHits(const Blobs& blobs) const -> const FieldHitsArray&
         {
-            if(printOnDestruction && !fieldHits.empty())
-                print();
+            return reinterpret_cast<const FieldHitsArray&>(*&blobs[blobCount - 1][0]);
         }
 
-        template<std::size_t... RecordCoords>
-        LLAMA_FN_HOST_ACC_INLINE auto blobNrAndOffset(
-            typename Mapping::ArrayIndex ai,
-            RecordCoord<RecordCoords...> rc = {}) const -> NrAndOffset<size_type>
+        template<typename Blobs>
+        LLAMA_FN_HOST_ACC_INLINE auto fieldHits(Blobs& blobs) const -> FieldHitsArray&
         {
-            ++fieldHits[flatRecordCoord<RecordDim, RecordCoord<RecordCoords...>>];
-            return Mapping::blobNrAndOffset(ai, rc);
+            return const_cast<FieldHitsArray&>(fieldHits(std::as_const(blobs)));
         }
 
-        void print() const
+        template<typename Blobs>
+        LLAMA_FN_HOST_ACC_INLINE void printFieldHits(const Blobs& blobs) const
         {
-            std::cout << "Trace mapping, number of accesses:\n";
+            const auto& hits = fieldHits(blobs);
+#ifdef __CUDA_ARCH__
+            if constexpr(MyCodeHandlesProxyReferences)
+                printf("Trace mapping, number of accesses:\n");
+            else
+                printf("Trace mapping, number of memory locations computed:\n");
+            for(int i = 0; i < static_cast<int>(hits.size()); i++)
+                if constexpr(MyCodeHandlesProxyReferences)
+                    printf(
+                        "\t%i:\tR: %lu\tW: %lu\n",
+                        i,
+                        static_cast<unsigned long>(hits[i].reads),
+                        static_cast<unsigned long>(hits[i].writes));
+                else
+                    printf("\t%i:\t%lu\n", i, static_cast<unsigned long>(hits[i].memLocsComputed));
+#else
+            if constexpr(MyCodeHandlesProxyReferences)
+                std::cout << "Trace mapping, number of accesses:\n";
+            else
+                std::cout << "Trace mapping, number of memory locations computed:\n";
             forEachLeafCoord<RecordDim>(
-                [this](auto rc)
+                [&](auto rc)
                 {
-                    std::cout << '\t' << recordCoordTags<RecordDim>(rc) << ":\t"
-                              << fieldHits[flatRecordCoord<RecordDim, decltype(rc)>] << '\n';
+                    const size_type i = flatRecordCoord<RecordDim, decltype(rc)>;
+                    if constexpr(MyCodeHandlesProxyReferences)
+                        std::cout << '\t' << recordCoordTags<RecordDim>(rc) << ":\tR: " << hits[i].reads
+                                  << "\tW: " << hits[i].writes << '\n';
+                    else
+                        std::cout << '\t' << recordCoordTags<RecordDim>(rc) << ":\t " << hits[i].memLocsComputed
+                                  << '\n';
                 });
+#endif
         }
 
-        mutable std::array<std::atomic<std::size_t>, flatFieldCount<RecordDim>> fieldHits;
-        bool printOnDestruction;
+    private:
+        LLAMA_FN_HOST_ACC_INLINE auto inner() const -> const Mapping&
+        {
+            return static_cast<const Mapping&>(*this);
+        }
     };
 } // namespace llama::mapping
diff --git a/tests/array.cpp b/tests/array.cpp
index 7f4335351d..b1324eea50 100644
--- a/tests/array.cpp
+++ b/tests/array.cpp
@@ -1,5 +1,11 @@
 #include "common.hpp"
 
+TEST_CASE("Array.empty")
+{
+    STATIC_REQUIRE(llama::Array<int, 0>{}.empty());
+    STATIC_REQUIRE(!llama::Array<int, 1>{1}.empty());
+}
+
 TEST_CASE("Array.operator<<")
 {
     auto put = [](auto array)
diff --git a/tests/mapping.HeatmapTrace.cpp b/tests/mapping.HeatmapTrace.cpp
index cf0b70dbaa..381d6eb7d8 100644
--- a/tests/mapping.HeatmapTrace.cpp
+++ b/tests/mapping.HeatmapTrace.cpp
@@ -57,46 +57,81 @@ TEST_CASE("Heatmap.nbody")
         llama::mapping::SingleBlobSoA<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{});
 }
 
-TEST_CASE("Trace.nbody")
+TEMPLATE_LIST_TEST_CASE("Trace.nbody.mem_locs_computed", "", SizeTypes)
 {
     auto run = [&](auto mapping)
     {
-        auto particles = llama::allocView(llama::mapping::Trace{mapping, false});
+        auto particles = llama::allocView(llama::mapping::Trace<decltype(mapping), TestType, false>{mapping});
         updateAndMove(particles);
-        auto& hits = particles.mapping().fieldHits;
+        auto& hits = particles.mapping().fieldHits(particles.storageBlobs);
         CHECK(hits.size() == 7);
-        CHECK(hits[0] == 10400);
-        CHECK(hits[1] == 10400);
-        CHECK(hits[2] == 10400);
-        CHECK(hits[3] == 400);
-        CHECK(hits[4] == 400);
-        CHECK(hits[5] == 400);
-        CHECK(hits[6] == 10300);
+        CHECK(hits[0].memLocsComputed == 10300);
+        CHECK(hits[1].memLocsComputed == 10300);
+        CHECK(hits[2].memLocsComputed == 10300);
+        CHECK(hits[3].memLocsComputed == 300);
+        CHECK(hits[4].memLocsComputed == 300);
+        CHECK(hits[5].memLocsComputed == 300);
+        CHECK(hits[6].memLocsComputed == 10200);
+
+        std::stringstream buffer;
+        std::streambuf* old = std::cout.rdbuf(buffer.rdbuf());
+        particles.mapping().printFieldHits(particles.storageBlobs);
+        std::cout.rdbuf(old);
+        CHECK(
+            buffer.str()
+            == "Trace mapping, number of memory locations computed:\n"
+               "\tPos.X:\t 10300\n"
+               "\tPos.Y:\t 10300\n"
+               "\tPos.Z:\t 10300\n"
+               "\tVel.X:\t 300\n"
+               "\tVel.Y:\t 300\n"
+               "\tVel.Z:\t 300\n"
+               "\tMass:\t 10200\n");
     };
     run(llama::mapping::AlignedAoS<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{});
     run(llama::mapping::SingleBlobSoA<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{});
 }
 
-TEST_CASE("Trace.print_dtor")
+TEMPLATE_LIST_TEST_CASE("Trace.nbody.reads_writes", "", SizeTypes)
 {
-    std::stringstream buffer;
-    std::streambuf* old = std::cout.rdbuf(buffer.rdbuf());
+    auto run = [&](auto mapping)
     {
-        auto particles = llama::allocView(
-            llama::mapping::Trace{llama::mapping::AlignedAoS<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{}});
+        auto particles = llama::allocView(llama::mapping::Trace<decltype(mapping), TestType>{mapping});
         updateAndMove(particles);
-    }
-    std::cout.rdbuf(old);
-    CHECK(
-        buffer.str()
-        == "Trace mapping, number of accesses:\n"
-           "\tPos.X:\t10400\n"
-           "\tPos.Y:\t10400\n"
-           "\tPos.Z:\t10400\n"
-           "\tVel.X:\t400\n"
-           "\tVel.Y:\t400\n"
-           "\tVel.Z:\t400\n"
-           "\tMass:\t10300\n");
+        auto& hits = particles.mapping().fieldHits(particles.storageBlobs);
+        CHECK(hits.size() == 7);
+        CHECK(hits[0].reads == 10200);
+        CHECK(hits[1].reads == 10200);
+        CHECK(hits[2].reads == 10200);
+        CHECK(hits[3].reads == 200);
+        CHECK(hits[4].reads == 200);
+        CHECK(hits[5].reads == 200);
+        CHECK(hits[6].reads == 10100);
+        CHECK(hits[0].writes == 200);
+        CHECK(hits[1].writes == 200);
+        CHECK(hits[2].writes == 200);
+        CHECK(hits[3].writes == 100);
+        CHECK(hits[4].writes == 100);
+        CHECK(hits[5].writes == 100);
+        CHECK(hits[6].writes == 100);
+
+        std::stringstream buffer;
+        std::streambuf* old = std::cout.rdbuf(buffer.rdbuf());
+        particles.mapping().printFieldHits(particles.storageBlobs);
+        std::cout.rdbuf(old);
+        CHECK(
+            buffer.str()
+            == "Trace mapping, number of accesses:\n"
+               "\tPos.X:\tR: 10200\tW: 200\n"
+               "\tPos.Y:\tR: 10200\tW: 200\n"
+               "\tPos.Z:\tR: 10200\tW: 200\n"
+               "\tVel.X:\tR: 200\tW: 100\n"
+               "\tVel.Y:\tR: 200\tW: 100\n"
+               "\tVel.Z:\tR: 200\tW: 100\n"
+               "\tMass:\tR: 10100\tW: 100\n");
+    };
+    run(llama::mapping::AlignedAoS<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{});
+    run(llama::mapping::SingleBlobSoA<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{});
 }
 
 namespace
diff --git a/tests/mapping.cpp b/tests/mapping.cpp
index 54a15775a9..fac2a70b22 100644
--- a/tests/mapping.cpp
+++ b/tests/mapping.cpp
@@ -21,7 +21,7 @@ TEMPLATE_LIST_TEST_CASE("mapping.concepts", "", SizeTypes)
     STATIC_REQUIRE(llama::PhysicalMapping<llama::mapping::AoSoA<llama::ArrayExtents<TestType, 8>, Particle, 8>>);
 
     using Inner = llama::mapping::AlignedAoS<llama::ArrayExtents<TestType, 8>, Particle>;
-    STATIC_REQUIRE(llama::PhysicalMapping<llama::mapping::Trace<Inner>>);
+    STATIC_REQUIRE(llama::FullyComputedMapping<llama::mapping::Trace<Inner>>);
     STATIC_REQUIRE(llama::PhysicalMapping<llama::mapping::Heatmap<Inner>>);
     STATIC_REQUIRE(