From b469ad6c49eb3505a5e8cb9fcaf21186129f84c2 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 27 Jul 2023 23:45:18 +0200
Subject: [PATCH 01/13] nlohmann_json refactor

---
 CMakeLists.txt                                |    2 +-
 benchmark/CMakeLists.txt                      |    4 +-
 benchmark/blas/blas.cpp                       |   21 +-
 benchmark/blas/blas_common.hpp                |  247 +--
 benchmark/blas/distributed/multi_vector.cpp   |   30 +-
 .../CMakeLists.txt                            |    2 +-
 benchmark/conversion/conversion.cpp           |  194 ++
 benchmark/conversions/conversions.cpp         |  223 --
 .../matrix_generator/matrix_generator.cpp     |   36 +-
 .../matrix_statistics/matrix_statistics.cpp   |  183 +-
 benchmark/preconditioner/preconditioner.cpp   |  238 +--
 benchmark/solver/distributed/solver.cpp       |   32 +-
 benchmark/solver/solver.cpp                   |   21 +-
 benchmark/solver/solver_common.hpp            |  406 ++--
 benchmark/sparse_blas/operations.cpp          |   13 +-
 benchmark/sparse_blas/operations.hpp          |    8 +-
 benchmark/sparse_blas/sparse_blas.cpp         |  191 +-
 benchmark/spmv/distributed/spmv.cpp           |   67 +-
 benchmark/spmv/spmv.cpp                       |   33 +-
 benchmark/spmv/spmv_common.hpp                |  289 ++-
 benchmark/test/reference/blas.profile.stderr  |   69 +-
 benchmark/test/reference/blas.simple.stderr   |   69 +-
 .../test/reference/conversion.all.stderr      | 1862 +----------------
 .../test/reference/conversion.all.stdout      |   74 +-
 .../test/reference/conversion.matrix.stderr   |   42 +-
 .../test/reference/conversion.matrix.stdout   |   16 +-
 .../test/reference/conversion.profile.stderr  |   98 +-
 .../test/reference/conversion.profile.stdout  |   19 +-
 .../test/reference/conversion.simple.stderr   |   42 +-
 .../test/reference/conversion.simple.stdout   |   19 +-
 .../distributed_solver.matrix.stdout          |    3 +-
 .../distributed_solver.profile.stderr         |    8 +-
 .../distributed_solver.profile.stdout         |    6 +-
 .../distributed_solver.simple.stdout          |    6 +-
 .../reference/matrix_statistics.matrix.stderr |    2 +-
 .../reference/matrix_statistics.matrix.stdout |    4 +-
 .../reference/matrix_statistics.simple.stderr |    2 +-
 .../reference/matrix_statistics.simple.stdout |    7 +-
 .../reference/preconditioner.matrix.stderr    |   33 +-
 .../reference/preconditioner.matrix.stdout    |    4 +-
 .../reference/preconditioner.profile.stderr   |   29 +-
 .../reference/preconditioner.profile.stdout   |    7 +-
 .../reference/preconditioner.simple.stderr    |   33 +-
 .../reference/preconditioner.simple.stdout    |    7 +-
 benchmark/test/reference/solver.matrix.stdout |    3 +-
 .../test/reference/solver.profile.stderr      |    8 +-
 .../test/reference/solver.profile.stdout      |    6 +-
 benchmark/test/reference/solver.simple.stdout |    6 +-
 .../test/reference/sparse_blas.matrix.stderr  |   29 +-
 .../test/reference/sparse_blas.profile.stderr |   23 +-
 .../test/reference/sparse_blas.simple.stderr  |   30 +-
 benchmark/test/reference/spmv.matrix.stderr   |   21 +-
 benchmark/test/reference/spmv.matrix.stdout   |    5 +-
 benchmark/test/reference/spmv.profile.stderr  |   32 +-
 benchmark/test/reference/spmv.profile.stdout  |    6 +-
 benchmark/test/reference/spmv.simple.stderr   |   21 +-
 benchmark/test/reference/spmv.simple.stdout   |    6 +-
 benchmark/utils/general.hpp                   |  342 +--
 benchmark/utils/general_matrix.hpp            |   18 +-
 benchmark/utils/generator.hpp                 |  118 +-
 benchmark/utils/iteration_control.hpp         |  326 +++
 benchmark/utils/json.hpp                      |   63 +-
 benchmark/utils/loggers.hpp                   |  100 +-
 benchmark/utils/runner.hpp                    |  209 ++
 benchmark/utils/spmv_validation.hpp           |   83 -
 third_party/CMakeLists.txt                    |    4 +-
 third_party/nlohmann_json/CMakeLists.txt      |    9 +
 third_party/rapidjson/CMakeLists.txt          |   14 -
 68 files changed, 1864 insertions(+), 4319 deletions(-)
 rename benchmark/{conversions => conversion}/CMakeLists.txt (88%)
 create mode 100644 benchmark/conversion/conversion.cpp
 delete mode 100644 benchmark/conversions/conversions.cpp
 create mode 100644 benchmark/utils/iteration_control.hpp
 create mode 100644 benchmark/utils/runner.hpp
 delete mode 100644 benchmark/utils/spmv_validation.hpp
 create mode 100644 third_party/nlohmann_json/CMakeLists.txt
 delete mode 100644 third_party/rapidjson/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1c9f22b4db1..a483f09a0d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -256,7 +256,7 @@ if(GINKGO_BUILD_TESTS)
 endif()
 if(GINKGO_BUILD_BENCHMARKS)
     find_package(gflags 2.2.2 QUIET)
-    find_package(RapidJSON 1.1.0 QUIET)
+    find_package(nlohmann_json 3.9.1 QUIET)
 endif()
 
 # System provided, third party libraries (not bundled!)
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 44a0a3d1d9e..e993ee6cf0c 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -57,7 +57,7 @@ endfunction()
 # All remaining arguments will be treated as source files
 function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def type)
     add_executable("${name}" ${ARGN})
-    target_link_libraries("${name}" ginkgo gflags rapidjson)
+    target_link_libraries("${name}" ginkgo gflags nlohmann_json::nlohmann_json)
     # always include the device timer
     if (GINKGO_BUILD_CUDA)
         target_compile_definitions("${name}" PRIVATE HAS_CUDA_TIMER=1)
@@ -149,7 +149,7 @@ if (GINKGO_BUILD_MPI)
 endif()
 
 add_subdirectory(blas)
-add_subdirectory(conversions)
+add_subdirectory(conversion)
 add_subdirectory(matrix_generator)
 add_subdirectory(matrix_statistics)
 add_subdirectory(preconditioner)
diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp
index 11228ed5818..f7ad8120a80 100644
--- a/benchmark/blas/blas.cpp
+++ b/benchmark/blas/blas.cpp
@@ -130,26 +130,17 @@ Parameters for a benchmark case are:
     stride_B: stride for B matrix in gemm (optional, default m)
     stride_C: stride for C matrix in gemm (optional, default m)
 )";
-    std::string format = example_config;
+    std::string format = Generator::get_example_config();
     initialize_argument_parsing(&argc, &argv, header, format);
 
-    std::string extra_information =
-        "The operations are " + FLAGS_operations + "\n";
+    std::string extra_information = "The operations are " + FLAGS_operations;
     print_general_information(extra_information);
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
 
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        std::cerr
-            << "Input has to be a JSON array of benchmark configurations:\n"
-            << format;
-        std::exit(1);
-    }
+    auto test_cases = json::parse(get_input_stream());
 
-    run_blas_benchmarks(exec, get_timer(exec, FLAGS_gpu_timer), operation_map,
-                        test_cases, true);
+    run_test_cases(BlasBenchmark{operation_map}, exec,
+                   get_timer(exec, FLAGS_gpu_timer), test_cases);
 
-    std::cout << test_cases << std::endl;
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp
index fe0110f82fb..88819a043b0 100644
--- a/benchmark/blas/blas_common.hpp
+++ b/benchmark/blas/blas_common.hpp
@@ -43,7 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "benchmark/utils/general.hpp"
+#include "benchmark/utils/iteration_control.hpp"
 #include "benchmark/utils/loggers.hpp"
+#include "benchmark/utils/runner.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
@@ -70,14 +72,6 @@ DEFINE_string(
     "C has dimensions n x m and x and y have dimensions n x r");
 
 
-std::string example_config = R"(
-  [
-    { "n": 100 },
-    { "n": 200, "m": 200, "k": 200 }
-  ]
-)";
-
-
 class BenchmarkOperation {
 public:
     virtual ~BenchmarkOperation() = default;
@@ -404,70 +398,101 @@ struct dimensions {
 };
 
 
-dimensions parse_dims(rapidjson::Value& test_case)
-{
-    auto get_optional = [](rapidjson::Value& obj, const char* name,
-                           gko::size_type default_value) -> gko::size_type {
-        if (obj.HasMember(name)) {
-            return obj[name].GetUint64();
-        } else {
-            return default_value;
-        }
-    };
-
-    dimensions result;
-    result.n = test_case["n"].GetInt64();
-    result.k = get_optional(test_case, "k", result.n);
-    result.m = get_optional(test_case, "m", result.n);
-    result.r = get_optional(test_case, "r", 1);
-    if (test_case.HasMember("stride")) {
-        result.stride_x = test_case["stride"].GetInt64();
-        result.stride_y = result.stride_x;
-    } else {
-        result.stride_x = get_optional(test_case, "stride_x", result.r);
-        result.stride_y = get_optional(test_case, "stride_y", result.r);
+struct BlasBenchmark : Benchmark<dimensions> {
+    using map_type =
+        std::map<std::string,
+                 std::function<std::unique_ptr<BenchmarkOperation>(
+                     std::shared_ptr<const gko::Executor>, dimensions)>>;
+    map_type operation_map;
+    std::vector<std::string> operations;
+    std::string name;
+    bool do_print;
+
+    BlasBenchmark(map_type operation_map, bool do_print = true)
+        : operation_map{std::move(operation_map)},
+          operations{split(FLAGS_operations)},  // matches declaration order
+          name{"blas"},
+          do_print{do_print}
+    {}
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return operations;
     }
-    result.stride_A = get_optional(test_case, "stride_A", result.k);
-    result.stride_B = get_optional(test_case, "stride_B", result.m);
-    result.stride_C = get_optional(test_case, "stride_C", result.m);
-    return result;
-}
 
+    bool should_print() const override { return do_print; }
 
-std::string describe(rapidjson::Value& test_case)
-{
-    std::stringstream ss;
-    auto optional_output = [&](const char* name) {
-        if (test_case.HasMember(name) && test_case[name].IsInt64()) {
-            ss << name << " = " << test_case[name].GetInt64() << " ";
-        }
-    };
-    optional_output("n");
-    optional_output("k");
-    optional_output("m");
-    optional_output("r");
-    optional_output("stride");
-    optional_output("stride_x");
-    optional_output("stride_y");
-    optional_output("stride_A");
-    optional_output("stride_B");
-    optional_output("stride_C");
-    return ss.str();
-}
+    std::string get_example_config() const override
+    {
+        return json::parse(R"([{"n": 100}, {"n": 200, "m": 200, "k": 200}])")
+            .dump(4);
+    }
 
+    bool validate_config(const json& value) const override
+    {
+        return value.contains("n") && value["n"].is_number_integer();
+    }
 
-template <typename OpMap>
-void apply_blas(const char* operation_name, std::shared_ptr<gko::Executor> exec,
-                std::shared_ptr<Timer> timer, const OpMap& operation_map,
-                rapidjson::Value& test_case,
-                rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& blas_case = test_case["blas"];
-        add_or_set_member(blas_case, operation_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
+    // Returns a human-readable summary of test_case: its integer-valued size
+    // and stride entries as "name = value " pairs. Entries that are absent
+    // or not integers are left out.
+    std::string describe_config(const json& test_case) const override
+    {
+        // fixed output order, independent of the order in the input
+        static const char* const keys[] = {
+            "n",        "k",        "m",        "r",        "stride",
+            "stride_x", "stride_y", "stride_A", "stride_B", "stride_C"};
+        std::stringstream description;
+        for (const char* key : keys) {
+            if (!test_case.contains(key)) {
+                continue;
+            }
+            const auto& entry = test_case[key];
+            if (entry.is_number_integer()) {
+                description << key << " = " << entry.get<gko::int64>() << " ";
+            }
+        }
+        return description.str();
+    }
+
+    dimensions setup(std::shared_ptr<gko::Executor> exec,
+                     json& test_case) const override
+    {
+        // value of an optional entry, or the fallback if it is not present
+        const auto value_or = [&test_case](const char* key,
+                                           gko::size_type fallback)
+            -> gko::size_type {
+            return test_case.contains(key) ? test_case[key].get<gko::uint64>()
+                                           : fallback;
+        };
+
+        dimensions dims;
+        dims.n = test_case["n"].get<gko::int64>();
+        dims.k = value_or("k", dims.n);
+        dims.m = value_or("m", dims.n);
+        dims.r = value_or("r", 1);
+        // a common "stride" takes precedence over the individual strides
+        if (!test_case.contains("stride")) {
+            dims.stride_x = value_or("stride_x", dims.r);
+            dims.stride_y = value_or("stride_y", dims.r);
+        } else {
+            dims.stride_x = test_case["stride"].get<gko::int64>();
+            dims.stride_y = dims.stride_x;
+        }
+        dims.stride_A = value_or("stride_A", dims.k);
+        dims.stride_B = value_or("stride_B", dims.m);
+        dims.stride_C = value_or("stride_C", dims.m);
+        return dims;
+    }
 
-        auto op = operation_map.at(operation_name)(exec, parse_dims(test_case));
+
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             dimensions& dims, const std::string& operation_name,
+             json& operation_case) const override
+    {
+        auto op = operation_map.at(operation_name)(exec, dims);
 
         IterationControl ic(timer);
 
@@ -488,89 +513,9 @@ void apply_blas(const char* operation_name, std::shared_ptr<gko::Executor> exec,
         const auto flops = static_cast<double>(op->get_flops());
         const auto mem = static_cast<double>(op->get_memory());
         const auto repetitions = ic.get_num_repetitions();
-        add_or_set_member(blas_case[operation_name], "time", runtime,
-                          allocator);
-        add_or_set_member(blas_case[operation_name], "flops", flops / runtime,
-                          allocator);
-        add_or_set_member(blas_case[operation_name], "bandwidth", mem / runtime,
-                          allocator);
-        add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
-                          allocator);
-
-        // compute and write benchmark data
-        add_or_set_member(blas_case[operation_name], "completed", true,
-                          allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case["blas"][operation_name], "completed", false,
-                          allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["blas"][operation_name], "error",
-                              msg_value, allocator);
-        }
-        std::cerr << "Error when processing test case\n"
-                  << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
-    }
-}
-
-
-template <typename OpMap>
-void run_blas_benchmarks(std::shared_ptr<gko::Executor> exec,
-                         std::shared_ptr<Timer> timer,
-                         const OpMap& operation_map,
-                         rapidjson::Document& test_cases, bool do_print)
-{
-    auto operations = split(FLAGS_operations, ',');
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
+        operation_case["time"] = runtime;
+        operation_case["flops"] = flops / runtime;
+        operation_case["bandwidth"] = mem / runtime;
+        operation_case["repetitions"] = repetitions;
     }
-    auto annotate = annotate_functor{profiler_hook};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            if (!test_case.HasMember("blas")) {
-                test_case.AddMember("blas",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& blas_case = test_case["blas"];
-            if (!FLAGS_overwrite &&
-                all_of(begin(operations), end(operations),
-                       [&blas_case](const std::string& s) {
-                           return blas_case.HasMember(s.c_str());
-                       })) {
-                continue;
-            }
-            if (do_print) {
-                std::clog << "Running test case\n" << test_case << std::endl;
-            }
-            // annotate the test case
-            auto test_case_range = annotate(describe(test_case));
-            for (const auto& operation_name : operations) {
-                {
-                    auto operation_range = annotate(operation_name.c_str());
-                    apply_blas(operation_name.c_str(), exec, timer,
-                               operation_map, test_case, allocator);
-                }
-
-                if (do_print) {
-                    std::clog << "Current state:" << std::endl
-                              << test_cases << std::endl;
-
-                    backup_results(test_cases);
-                }
-            }
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up benchmark, what(): " << e.what()
-                      << std::endl;
-        }
-    }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
-}
+};
diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp
index be326b08b96..d95e5fb38ac 100644
--- a/benchmark/blas/distributed/multi_vector.cpp
+++ b/benchmark/blas/distributed/multi_vector.cpp
@@ -50,6 +50,10 @@ int main(int argc, char* argv[])
 {
     gko::experimental::mpi::environment mpi_env{argc, argv};
 
+    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    const auto rank = comm.rank();
+    const auto do_print = rank == 0;
+
     std::string header = R"("
 A benchmark for measuring performance of Ginkgo's BLAS-like "
 operations.
@@ -60,13 +64,10 @@ Parameters for a benchmark case are:
     stride_x: stride for input vector x (optional, default r)
     stride_y: stride for in/out vector y (optional, default r)
 )";
-    std::string format = example_config;
-    initialize_argument_parsing(&argc, &argv, header, format);
+    std::string format = Generator::get_example_config();
+    initialize_argument_parsing(&argc, &argv, header, format, do_print);
 
-    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-    const auto rank = comm.rank();
-
-    if (rank == 0) {
+    if (do_print) {
         std::string extra_information =
             "The operations are " + FLAGS_operations;
         print_general_information(extra_information);
@@ -75,14 +76,7 @@ Parameters for a benchmark case are:
     auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());
 
     std::string json_input = broadcast_json_input(get_input_stream(), comm);
-    rapidjson::Document test_cases;
-    test_cases.Parse(json_input.c_str());
-    if (!test_cases.IsArray()) {
-        std::cerr
-            << "Input has to be a JSON array of benchmark configurations:\n"
-            << format;
-        std::exit(1);
-    }
+    auto test_cases = json::parse(json_input);
 
     std::map<std::string,
              std::function<std::unique_ptr<BenchmarkOperation>(
@@ -130,10 +124,10 @@ Parameters for a benchmark case are:
                      exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_y);
              }}};
 
-    run_blas_benchmarks(exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer),
-                        operation_map, test_cases, rank == 0);
+    run_test_cases(BlasBenchmark{operation_map, do_print}, exec,
+                   get_mpi_timer(exec, comm, FLAGS_gpu_timer), test_cases);
 
-    if (rank == 0) {
-        std::cout << test_cases << std::endl;
+    if (do_print) {
+        std::cout << std::setw(4) << test_cases << std::endl;
     }
 }
diff --git a/benchmark/conversions/CMakeLists.txt b/benchmark/conversion/CMakeLists.txt
similarity index 88%
rename from benchmark/conversions/CMakeLists.txt
rename to benchmark/conversion/CMakeLists.txt
index 21dd363d3c0..7ecf578c055 100644
--- a/benchmark/conversions/CMakeLists.txt
+++ b/benchmark/conversion/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_add_typed_benchmark_executables(conversion "NO" conversions.cpp)
+ginkgo_add_typed_benchmark_executables(conversion "NO" conversion.cpp)
diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp
new file mode 100644
index 00000000000..b9a5d5c46d6
--- /dev/null
+++ b/benchmark/conversion/conversion.cpp
@@ -0,0 +1,194 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <exception>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <typeinfo>
+
+
+#include "benchmark/utils/formats.hpp"
+#include "benchmark/utils/general_matrix.hpp"
+#include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/iteration_control.hpp"
+#include "benchmark/utils/runner.hpp"
+#include "benchmark/utils/timer.hpp"
+#include "benchmark/utils/types.hpp"
+
+
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+#include "benchmark/utils/tuning_variables.hpp"
+#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
+
+
+using Generator = DefaultSystemGenerator<>;
+
+
+// Benchmarks reading each format from matrix_data ("<from>-read") and every
+// conversion between two formats that Ginkgo supports ("<from>-<to>").
+struct ConversionBenchmark : Benchmark<gko::matrix_data<etype, itype>> {
+    std::string name;
+    std::vector<std::string> operations;
+
+    ConversionBenchmark() : name{"conversion"}
+    {
+        auto ref_exec = gko::ReferenceExecutor::create();
+        const auto format_names = split(FLAGS_formats);
+        for (const auto& from : format_names) {
+            operations.push_back(from + "-read");
+            auto source = formats::matrix_type_factory.at(from)(ref_exec);
+            // probe all ordered pairs, keeping those that can be converted
+            for (const auto& to : format_names) {
+                if (from == to) {
+                    continue;
+                }
+                auto target = formats::matrix_type_factory.at(to)(ref_exec);
+                try {
+                    target->copy_from(source);
+                    operations.push_back(from + "-" + to);
+                } catch (const std::exception&) {  // unsupported pair: skip
+                }
+            }
+        }
+    }
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return operations;
+    }
+
+    bool should_print() const override { return true; }
+
+    std::string get_example_config() const override
+    {
+        return Generator::get_example_config();
+    }
+
+    bool validate_config(const json& test_case) const override
+    {
+        return Generator::validate_config(test_case);
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        return Generator::describe_config(test_case);
+    }
+
+    gko::matrix_data<etype, itype> setup(std::shared_ptr<gko::Executor> exec,
+                                         json& test_case) const override
+    {
+        gko::matrix_data<etype, itype> data;
+        data = Generator::generate_matrix_data(test_case);
+        std::clog << "Matrix is of size (" << data.size[0] << ", "
+                  << data.size[1] << "), " << data.nonzeros.size() << std::endl;
+        test_case["rows"] = data.size[0];  // stored in the test case
+        test_case["cols"] = data.size[1];
+        test_case["nonzeros"] = data.nonzeros.size();
+        return data;
+    }
+
+    // Times "<from>-read" (reading data) or "<from>-<to>" (format conversion).
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             gko::matrix_data<etype, itype>& data,
+             const std::string& operation_name,
+             json& operation_case) const override
+    {
+        const auto dash = operation_name.find('-');
+        const auto from_name = operation_name.substr(0, dash);
+        // a name without '-' gets an empty target instead of reading past end
+        const auto to_name =
+            dash != std::string::npos ? operation_name.substr(dash + 1) : "";
+        auto mtx_from = formats::matrix_type_factory.at(from_name)(exec);
+        auto readable =
+            gko::as<gko::ReadableFromMatrixData<etype, itype>>(mtx_from.get());
+        IterationControl ic{timer};
+        if (to_name == "read") {
+            // warm run
+            for (auto _ : ic.warmup_run()) {
+                exec->synchronize();
+                readable->read(data);
+                exec->synchronize();
+            }
+            // timed run
+            for (auto _ : ic.run()) {
+                readable->read(data);
+            }
+        } else {
+            readable->read(data);
+            auto mtx_to = formats::matrix_type_factory.at(to_name)(exec);
+            // warm run
+            for (auto _ : ic.warmup_run()) {
+                exec->synchronize();
+                mtx_to->copy_from(mtx_from);
+                exec->synchronize();
+            }
+            // timed run
+            for (auto _ : ic.run()) {
+                mtx_to->copy_from(mtx_from);
+            }
+        }
+        operation_case["time"] = ic.compute_time(FLAGS_timer_method);
+        operation_case["repetitions"] = ic.get_num_repetitions();
+    }
+};
+
+
+// Reads the JSON test cases, benchmarks them and prints the resulting JSON.
+int main(int argc, char* argv[])
+{
+    std::string header =
+        "A benchmark for measuring performance of Ginkgo's conversions.\n";
+    std::string format_str = Generator::get_example_config();
+    initialize_argument_parsing_matrix(&argc, &argv, header, format_str);
+
+    std::string extra_information =
+        std::string() + "The formats are " + FLAGS_formats;
+    print_general_information(extra_information);
+
+    auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
+
+    auto test_cases = json::parse(get_input_stream());
+
+    run_test_cases(ConversionBenchmark{}, exec,
+                   get_timer(exec, FLAGS_gpu_timer), test_cases);
+
+    std::cout << std::setw(4) << test_cases << std::endl;
+}
diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp
deleted file mode 100644
index d9684321e2d..00000000000
--- a/benchmark/conversions/conversions.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2023, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <ginkgo/ginkgo.hpp>
-
-
-#include <algorithm>
-#include <chrono>
-#include <cstdlib>
-#include <exception>
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <typeinfo>
-
-
-#include "benchmark/utils/formats.hpp"
-#include "benchmark/utils/general_matrix.hpp"
-#include "benchmark/utils/generator.hpp"
-#include "benchmark/utils/spmv_validation.hpp"
-#include "benchmark/utils/timer.hpp"
-#include "benchmark/utils/types.hpp"
-
-
-#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
-#include "benchmark/utils/tuning_variables.hpp"
-#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
-
-
-// This function supposes that management of `FLAGS_overwrite` is done before
-// calling it
-void convert_matrix(const gko::LinOp* matrix_from, const char* format_to,
-                    const char* conversion_name,
-                    std::shared_ptr<gko::Executor> exec,
-                    rapidjson::Value& test_case,
-                    rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& conversion_case = test_case["conversions"];
-        add_or_set_member(conversion_case, conversion_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-
-        gko::matrix_data<etype, itype> data{gko::dim<2>{1, 1}, 1};
-        auto matrix_to = share(formats::matrix_factory(format_to, exec, data));
-
-        auto timer = get_timer(exec, FLAGS_gpu_timer);
-        IterationControl ic{timer};
-
-        // warm run
-        for (auto _ : ic.warmup_run()) {
-            exec->synchronize();
-            matrix_to->copy_from(matrix_from);
-            exec->synchronize();
-            matrix_to->clear();
-        }
-        // timed run
-        for (auto _ : ic.run()) {
-            matrix_to->copy_from(matrix_from);
-        }
-        add_or_set_member(conversion_case[conversion_name], "time",
-                          ic.compute_time(FLAGS_timer_method), allocator);
-        add_or_set_member(conversion_case[conversion_name], "repetitions",
-                          ic.get_num_repetitions(), allocator);
-
-        // compute and write benchmark data
-        add_or_set_member(conversion_case[conversion_name], "completed", true,
-                          allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case["conversions"][conversion_name],
-                          "completed", false, allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["conversions"][conversion_name],
-                              "error", msg_value, allocator);
-        }
-        std::cerr << "Error when processing test case\n"
-                  << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
-    }
-}
-
-
-int main(int argc, char* argv[])
-{
-    std::string header =
-        "A benchmark for measuring performance of Ginkgo's conversions.\n";
-    std::string format_str = example_config;
-    initialize_argument_parsing_matrix(&argc, &argv, header, format_str);
-
-    std::string extra_information =
-        std::string() + "The formats are " + FLAGS_formats + "\n";
-    print_general_information(extra_information);
-
-    auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
-    auto formats = split(FLAGS_formats, ',');
-
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
-
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
-    }
-    auto annotate = annotate_functor{profiler_hook};
-
-    DefaultSystemGenerator<> generator{};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        std::clog << "Benchmarking conversions. " << std::endl;
-        // set up benchmark
-        validate_option_object(test_case);
-        if (!test_case.HasMember("conversions")) {
-            test_case.AddMember("conversions",
-                                rapidjson::Value(rapidjson::kObjectType),
-                                allocator);
-        }
-        auto& conversion_case = test_case["conversions"];
-
-        std::clog << "Running test case\n" << test_case << std::endl;
-        gko::matrix_data<etype, itype> data;
-        try {
-            data = generator.generate_matrix_data(test_case);
-        } catch (std::exception& e) {
-            std::cerr << "Error setting up matrix data, what(): " << e.what()
-                      << std::endl;
-            if (FLAGS_keep_errors) {
-                rapidjson::Value msg_value;
-                msg_value.SetString(e.what(), allocator);
-                add_or_set_member(test_case, "error", msg_value, allocator);
-            }
-            continue;
-        }
-        std::clog << "Matrix is of size (" << data.size[0] << ", "
-                  << data.size[1] << ")" << std::endl;
-        add_or_set_member(test_case, "size", data.size[0], allocator);
-        // annotate the test case
-        auto test_case_range = annotate(generator.describe_config(test_case));
-        for (const auto& format_from : formats) {
-            try {
-                auto matrix_from =
-                    share(formats::matrix_factory(format_from, exec, data));
-                for (const auto& format_to : formats) {
-                    if (format_from == format_to) {
-                        continue;
-                    }
-                    auto conversion_name =
-                        std::string(format_from) + "-" + format_to;
-
-                    if (!FLAGS_overwrite &&
-                        conversion_case.HasMember(conversion_name.c_str())) {
-                        continue;
-                    }
-                    {
-                        auto conversion_range =
-                            annotate(conversion_name.c_str());
-                        convert_matrix(matrix_from.get(), format_to.c_str(),
-                                       conversion_name.c_str(), exec, test_case,
-                                       allocator);
-                    }
-                    std::clog << "Current state:" << std::endl
-                              << test_cases << std::endl;
-                }
-                backup_results(test_cases);
-            } catch (const gko::AllocationError& e) {
-                for (const auto& format : formats::matrix_type_factory) {
-                    const auto format_to = std::get<0>(format);
-                    auto conversion_name =
-                        std::string(format_from) + "-" + format_to;
-                    add_or_set_member(
-                        test_case["conversions"][conversion_name.c_str()],
-                        "completed", false, allocator);
-                }
-                std::cerr << "Error when allocating data for type "
-                          << format_from << ". what(): " << e.what()
-                          << std::endl;
-                backup_results(test_cases);
-            } catch (const std::exception& e) {
-                std::cerr << "Error when running benchmark, what(): "
-                          << e.what() << std::endl;
-            }
-        }
-    }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
-
-    std::cout << test_cases << std::endl;
-}
diff --git a/benchmark/matrix_generator/matrix_generator.cpp b/benchmark/matrix_generator/matrix_generator.cpp
index 138b5a9c2ce..193d95f897f 100644
--- a/benchmark/matrix_generator/matrix_generator.cpp
+++ b/benchmark/matrix_generator/matrix_generator.cpp
@@ -85,31 +85,33 @@ std::string input_format =
 // clang-format on
 
 
-void validate_option_object(const rapidjson::Value& value)
+void validate_option_object(const json& value)
 {
-    if (!value.IsObject() || !value.HasMember("filename") ||
-        !value["filename"].IsString() || !value.HasMember("problem") ||
-        !value["problem"].IsObject() || !value["problem"].HasMember("type") ||
-        !value["problem"]["type"].IsString()) {
+    if (!value.is_object() || !value.contains("filename") ||
+        !value["filename"].is_string() || !value.contains("problem") ||
+        !value["problem"].is_object() || !value["problem"].contains("type") ||
+        !value["problem"]["type"].is_string()) {
         print_config_error_and_exit(2);
     }
 }
 
 
 using generator_function = std::function<gko::matrix_data<etype, itype>(
-    rapidjson::Value&, std::default_random_engine&)>;
+    json&, std::default_random_engine&)>;
 
 
 // matrix generators
 gko::matrix_data<etype, itype> generate_block_diagonal(
-    rapidjson::Value& config, std::default_random_engine& engine)
+    json& config, std::default_random_engine& engine)
 {
-    if (!config.HasMember("num_blocks") || !config["num_blocks"].IsUint() ||
-        !config.HasMember("block_size") || !config["block_size"].IsUint()) {
+    if (!config.contains("num_blocks") ||
+        !config["num_blocks"].is_number_unsigned() ||
+        !config.contains("block_size") ||
+        !config["block_size"].is_number_unsigned()) {
         print_config_error_and_exit(2);
     }
-    auto num_blocks = config["num_blocks"].GetUint();
-    auto block_size = config["block_size"].GetUint();
+    auto num_blocks = config["num_blocks"].get<gko::uint64>();
+    auto block_size = config["block_size"].get<gko::uint64>();
     auto block = gko::matrix_data<etype, itype>(
         gko::dim<2>(block_size),
         std::uniform_real_distribution<rc_etype>(-1.0, 1.0), engine);
@@ -132,20 +134,20 @@ int main(int argc, char* argv[])
     std::clog << gko::version_info::get() << std::endl;
 
     auto engine = get_engine();
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document configurations;
-    configurations.ParseStream(jcin);
+    // Parse without exceptions: malformed input yields a discarded value,
+    // which fails the is_array() check below instead of throwing.
+    auto configurations = json::parse(get_input_stream(), nullptr, false);
 
-    if (!configurations.IsArray()) {
+    if (!configurations.is_array()) {
         print_config_error_and_exit(1);
     }
 
-    for (auto& config : configurations.GetArray()) {
+    for (auto& config : configurations) {
         try {
             validate_option_object(config);
             std::clog << "Generating matrix: " << config << std::endl;
-            auto filename = config["filename"].GetString();
-            auto type = config["problem"]["type"].GetString();
+            auto filename = config["filename"].get<std::string>();
+            auto type = config["problem"]["type"].get<std::string>();
             auto mdata = generator[type](config["problem"], engine);
             std::ofstream ofs(filename);
             gko::write_raw(ofs, mdata, gko::layout_type::coordinate);
diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp
index fccf4391ad5..40c505c7627 100644
--- a/benchmark/matrix_statistics/matrix_statistics.cpp
+++ b/benchmark/matrix_statistics/matrix_statistics.cpp
@@ -38,9 +38,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <iostream>
 
 
+#include <ginkgo/core/base/executor.hpp>
+
+
 #include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
-#include "benchmark/utils/spmv_validation.hpp"
+#include "benchmark/utils/runner.hpp"
 #include "benchmark/utils/types.hpp"
 
 
@@ -51,9 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 // See en.wikipedia.org/wiki/Five-number_summary
 // Quartile computation uses Method 3 from en.wikipedia.org/wiki/Quartile
-void compute_summary(const std::vector<gko::size_type>& dist,
-                     rapidjson::Value& out,
-                     rapidjson::MemoryPoolAllocator<>& allocator)
+void compute_summary(const std::vector<gko::size_type>& dist, json& out)
 {
     const auto q = dist.size() / 4;
     const auto r = dist.size() % 4;
@@ -72,23 +73,14 @@ void compute_summary(const std::vector<gko::size_type>& dist,
     };
     // clang-format on
 
-    add_or_set_member(out, "min", dist[0], allocator);
-    add_or_set_member(
-        out, "q1",
-        coefs[r][0] * static_cast<double>(dist[positions[r][0]]) +
-            coefs[r][1] * static_cast<double>(dist[positions[r][1]]),
-        allocator);
-    add_or_set_member(
-        out, "median",
-        coefs[r][2] * static_cast<double>(dist[positions[r][2]]) +
-            coefs[r][3] * static_cast<double>(dist[positions[r][3]]),
-        allocator);
-    add_or_set_member(
-        out, "q3",
-        coefs[r][4] * static_cast<double>(dist[positions[r][4]]) +
-            coefs[r][5] * static_cast<double>(dist[positions[r][5]]),
-        allocator);
-    add_or_set_member(out, "max", dist[dist.size() - 1], allocator);
+    out["min"] = dist.front();
+    out["q1"] = coefs[r][0] * static_cast<double>(dist[positions[r][0]]) +
+                coefs[r][1] * static_cast<double>(dist[positions[r][1]]);
+    out["median"] = coefs[r][2] * static_cast<double>(dist[positions[r][2]]) +
+                    coefs[r][3] * static_cast<double>(dist[positions[r][3]]);
+    out["q3"] = coefs[r][4] * static_cast<double>(dist[positions[r][4]]) +
+                coefs[r][5] * static_cast<double>(dist[positions[r][5]]);
+    out["max"] = dist.back();
 }
 
 
@@ -108,39 +100,30 @@ double compute_moment(int degree, const std::vector<gko::size_type>& dist,
 
 
 // See en.wikipedia.org/wiki/Moment_(mathematics)
-void compute_moments(const std::vector<gko::size_type>& dist,
-                     rapidjson::Value& out,
-                     rapidjson::MemoryPoolAllocator<>& allocator)
+void compute_moments(const std::vector<gko::size_type>& dist, json& out)
 {
     const auto mean = compute_moment(1, dist);
-    add_or_set_member(out, "mean", mean, allocator);
+    out["mean"] = mean;
     const auto variance = compute_moment(2, dist, mean);
-    add_or_set_member(out, "variance", variance, allocator);
+    out["variance"] = variance;
     const auto dev = std::sqrt(variance);
-    add_or_set_member(out, "skewness", compute_moment(3, dist, mean, dev),
-                      allocator);
-    add_or_set_member(out, "kurtosis", compute_moment(4, dist, mean, dev),
-                      allocator);
-    add_or_set_member(out, "hyperskewness", compute_moment(5, dist, mean, dev),
-                      allocator);
-    add_or_set_member(out, "hyperflatness", compute_moment(6, dist, mean, dev),
-                      allocator);
+    out["skewness"] = compute_moment(3, dist, mean, dev);
+    out["kurtosis"] = compute_moment(4, dist, mean, dev);
+    out["hyperskewness"] = compute_moment(5, dist, mean, dev);
+    out["hyperflatness"] = compute_moment(6, dist, mean, dev);
 }
 
 
-template <typename Allocator>
 void compute_distribution_properties(const std::vector<gko::size_type>& dist,
-                                     rapidjson::Value& out,
-                                     Allocator& allocator)
+                                     json& out)
 {
-    compute_summary(dist, out, allocator);
-    compute_moments(dist, out, allocator);
+    compute_summary(dist, out);
+    compute_moments(dist, out);
 }
 
 
-template <typename Allocator>
 void extract_matrix_statistics(gko::matrix_data<etype, gko::int64>& data,
-                               rapidjson::Value& problem, Allocator& allocator)
+                               json& problem)
 {
     std::vector<gko::size_type> row_dist(data.size[0]);
     std::vector<gko::size_type> col_dist(data.size[1]);
@@ -149,72 +132,90 @@ void extract_matrix_statistics(gko::matrix_data<etype, gko::int64>& data,
         ++col_dist[v.column];
     }
 
-    add_or_set_member(problem, "rows", data.size[0], allocator);
-    add_or_set_member(problem, "columns", data.size[1], allocator);
-    add_or_set_member(problem, "nonzeros", data.nonzeros.size(), allocator);
+    problem["rows"] = data.size[0];
+    problem["columns"] = data.size[1];
+    problem["nonzeros"] = data.nonzeros.size();
 
     std::sort(begin(row_dist), end(row_dist));
-    add_or_set_member(problem, "row_distribution",
-                      rapidjson::Value(rapidjson::kObjectType), allocator);
-    compute_distribution_properties(row_dist, problem["row_distribution"],
-                                    allocator);
+    problem["row_distribution"] = json::object();
+    compute_distribution_properties(row_dist, problem["row_distribution"]);
 
     std::sort(begin(col_dist), end(col_dist));
-    add_or_set_member(problem, "col_distribution",
-                      rapidjson::Value(rapidjson::kObjectType), allocator);
-    compute_distribution_properties(col_dist, problem["col_distribution"],
-                                    allocator);
+    problem["col_distribution"] = json::object();
+    compute_distribution_properties(col_dist, problem["col_distribution"]);
 }
 
 
-int main(int argc, char* argv[])
-{
-    std::string header =
-        "A utility that collects additional statistical properties of the "
-        "matrix.\n";
-    std::string format = example_config;
-    initialize_argument_parsing_matrix(&argc, &argv, header, format);
+using Generator = DefaultSystemGenerator<etype, gko::int64>;
 
-    std::clog << gko::version_info::get() << std::endl;
 
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+struct MatrixStatistics : Benchmark<int> {
+    std::string name;
+    std::vector<std::string> empty;
 
-    auto& allocator = test_cases.GetAllocator();
+    MatrixStatistics() : name{"problem"} {}
 
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            validate_option_object(test_case);
-            if (!test_case.HasMember("problem")) {
-                test_case.AddMember("problem",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& problem = test_case["problem"];
+    const std::string& get_name() const override { return name; }
 
-            std::clog << "Running test case\n" << test_case << std::endl;
+    const std::vector<std::string>& get_operations() const override
+    {
+        return empty;
+    }
 
-            auto matrix =
-                DefaultSystemGenerator<etype, gko::int64>::generate_matrix_data(
-                    test_case);
+    bool should_print() const override { return true; }
 
-            std::clog << "Matrix is of size (" << matrix.size[0] << ", "
-                      << matrix.size[1] << ")" << std::endl;
-            add_or_set_member(test_case, "size", matrix.size[0], allocator);
+    std::string get_example_config() const override
+    {
+        return Generator::get_example_config();
+    }
 
-            extract_matrix_statistics(matrix, test_case["problem"], allocator);
+    bool validate_config(const json& test_case) const override
+    {
+        return Generator::validate_config(test_case);
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        return Generator::describe_config(test_case);
+    }
 
-            backup_results(test_cases);
-        } catch (const std::exception& e) {
-            std::cerr << "Error extracting statistics, what(): " << e.what()
-                      << std::endl;
-        }
+    int setup(std::shared_ptr<gko::Executor> exec,
+              json& test_case) const override
+    {
+        auto data = Generator::generate_matrix_data(test_case);
+        std::clog << "Matrix is of size (" << data.size[0] << ", "
+                  << data.size[1] << "), " << data.nonzeros.size() << std::endl;
+        test_case["rows"] = data.size[0];
+        test_case["cols"] = data.size[1];
+        test_case["nonzeros"] = data.nonzeros.size();
+
+        extract_matrix_statistics(data, test_case["problem"]);
+        return 0;
     }
 
-    std::cout << test_cases << std::endl;
+
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             int& data, const std::string& operation_name,
+             json& operation_case) const override
+    {}  // intentionally empty; get_operations() returns an empty list
+};
+
+
+int main(int argc, char* argv[])
+{
+    std::string header =
+        "A utility that collects additional statistical properties of the "
+        "matrix.\n";
+    std::string format = Generator::get_example_config();
+    initialize_argument_parsing_matrix(&argc, &argv, header, format);
+
+    std::clog << gko::version_info::get() << std::endl;
+
+    auto test_cases = json::parse(get_input_stream());
+    auto exec = gko::ReferenceExecutor::create();
+
+    run_test_cases(MatrixStatistics{}, exec, get_timer(exec, false),
+                   test_cases);
+
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp
index e7859e992dc..7c130328d34 100644
--- a/benchmark/preconditioner/preconditioner.cpp
+++ b/benchmark/preconditioner/preconditioner.cpp
@@ -43,9 +43,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/iteration_control.hpp"
 #include "benchmark/utils/loggers.hpp"
 #include "benchmark/utils/preconditioners.hpp"
-#include "benchmark/utils/spmv_validation.hpp"
+#include "benchmark/utils/runner.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
 
@@ -128,34 +129,85 @@ std::string encode_parameters(const char* precond_name)
 }
 
 
-void run_preconditioner(const char* precond_name,
-                        std::shared_ptr<gko::Executor> exec,
-                        std::shared_ptr<const gko::LinOp> system_matrix,
-                        const vec<etype>* b, const vec<etype>* x,
-                        rapidjson::Value& test_case,
-                        rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& precond_object = test_case["preconditioner"];
-        auto encoded_name = encode_parameters(precond_name);
+struct preconditioner_benchmark_state {
+    std::unique_ptr<gko::LinOp> x;                    // zeroed apply() output
+    std::unique_ptr<gko::LinOp> b;                    // random apply() input
+    std::shared_ptr<const gko::LinOp> system_matrix;  // generate() operand
+};
+
+
+using Generator = DefaultSystemGenerator<>;
+
 
-        if (!FLAGS_overwrite &&
-            precond_object.HasMember(encoded_name.c_str())) {
-            return;
+struct PreconditionerBenchmark : Benchmark<preconditioner_benchmark_state> {
+    std::string name;
+    std::vector<std::string> preconditioners;
+    std::map<std::string, std::string> precond_decoder;
+
+    PreconditionerBenchmark() : name{"preconditioner"}
+    {
+        // list only encoded names: run() looks each one up in the decoder
+        for (auto precond : split(FLAGS_preconditioners)) {
+            preconditioners.push_back(encode_parameters(precond.c_str()));
+            precond_decoder[preconditioners.back()] = precond;
         }
+    }
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return preconditioners;
+    }
+
+    bool should_print() const override { return true; }
+
+    bool validate_config(const json& value) const override
+    {
+        return Generator::validate_config(value);
+    }
 
-        add_or_set_member(precond_object, encoded_name.c_str(),
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-        auto& this_precond_data = precond_object[encoded_name.c_str()];
+    std::string get_example_config() const override
+    {
+        return Generator::get_example_config();
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        return Generator::describe_config(test_case);
+    }
 
-        add_or_set_member(this_precond_data, "generate",
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-        add_or_set_member(this_precond_data, "apply",
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
+    preconditioner_benchmark_state setup(std::shared_ptr<gko::Executor> exec,
+                                         json& test_case) const override
+    {
+        preconditioner_benchmark_state state;
+        auto data = Generator::generate_matrix_data(test_case);
+
+        state.system_matrix =
+            formats::matrix_factory(FLAGS_formats, exec, data);
+        state.b = Generator::create_multi_vector_random(exec, data.size[0]);
+        state.x = Generator::create_multi_vector(exec, data.size[0],
+                                                 gko::zero<etype>());
+
+        std::clog << "Matrix is of size (" << data.size[0] << ", "
+                  << data.size[1] << "), " << data.nonzeros.size() << std::endl;
+        test_case["rows"] = data.size[0];
+        test_case["cols"] = data.size[1];
+        test_case["nonzeros"] = data.nonzeros.size();
+        return state;
+    }
+
+
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             preconditioner_benchmark_state& state,
+             const std::string& encoded_precond_name,
+             json& precond_case) const override
+    {
+        auto decoded_precond_name = precond_decoder.at(encoded_precond_name);
+        precond_case["generate"] = json::object();
+        precond_case["apply"] = json::object();
         for (auto stage : {"generate", "apply"}) {
-            add_or_set_member(this_precond_data[stage], "components",
-                              rapidjson::Value(rapidjson::kObjectType),
-                              allocator);
+            precond_case[stage]["components"] = json::object();
         }
 
         IterationControl ic_gen{get_timer(exec, FLAGS_gpu_timer)};
@@ -163,54 +215,51 @@ void run_preconditioner(const char* precond_name,
 
         {
             // fast run, gets total time
-            auto x_clone = clone(x);
-
-            auto precond = precond_factory.at(precond_name)(exec);
+            auto x_clone = clone(state.x);
 
+            auto precond = precond_factory.at(decoded_precond_name)(exec);
 
             for (auto _ : ic_apply.warmup_run()) {
-                precond->generate(system_matrix)->apply(b, x_clone);
+                precond->generate(state.system_matrix)->apply(state.b, x_clone);
             }
 
             std::unique_ptr<gko::LinOp> precond_op;
             for (auto _ : ic_gen.run()) {
-                precond_op = precond->generate(system_matrix);
+                precond_op = precond->generate(state.system_matrix);
             }
 
-            add_or_set_member(this_precond_data["generate"], "time",
-                              ic_gen.compute_time(FLAGS_timer_method),
-                              allocator);
-            add_or_set_member(this_precond_data["generate"], "repetitions",
-                              ic_gen.get_num_repetitions(), allocator);
+            precond_case["generate"]["time"] =
+                ic_gen.compute_time(FLAGS_timer_method);
+            precond_case["generate"]["repetitions"] =
+                ic_gen.get_num_repetitions();
 
             for (auto _ : ic_apply.run()) {
-                precond_op->apply(b, x_clone);
+                precond_op->apply(state.b, x_clone);
             }
 
-            add_or_set_member(this_precond_data["apply"], "time",
-                              ic_apply.compute_time(FLAGS_timer_method),
-                              allocator);
-            add_or_set_member(this_precond_data["apply"], "repetitions",
-                              ic_apply.get_num_repetitions(), allocator);
+            precond_case["apply"]["time"] =
+                ic_apply.compute_time(FLAGS_timer_method);
+            precond_case["apply"]["repetitions"] =
+                ic_apply.get_num_repetitions();
         }
 
         if (FLAGS_detailed) {
             // slow run, times each component separately
-            auto x_clone = clone(x);
-            auto precond = precond_factory.at(precond_name)(exec);
+            auto x_clone = clone(state.x);
+            auto precond = precond_factory.at(decoded_precond_name)(exec);
 
             std::unique_ptr<gko::LinOp> precond_op;
             {
                 auto gen_logger = create_operations_logger(
                     FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                    this_precond_data["generate"]["components"], allocator,
+                    precond_case["generate"]["components"],
                     ic_gen.get_num_repetitions());
                 exec->add_logger(gen_logger);
                 if (exec->get_master() != exec) {
                     exec->get_master()->add_logger(gen_logger);
                 }
                 for (auto i = 0u; i < ic_gen.get_num_repetitions(); ++i) {
-                    precond_op = precond->generate(system_matrix);
+                    precond_op = precond->generate(state.system_matrix);
                 }
                 if (exec->get_master() != exec) {
                     exec->get_master()->remove_logger(gen_logger);
@@ -220,39 +269,22 @@ void run_preconditioner(const char* precond_name,
 
             auto apply_logger = create_operations_logger(
                 FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                this_precond_data["apply"]["components"], allocator,
+                precond_case["apply"]["components"],
                 ic_apply.get_num_repetitions());
             exec->add_logger(apply_logger);
             if (exec->get_master() != exec) {
                 exec->get_master()->add_logger(apply_logger);
             }
             for (auto i = 0u; i < ic_apply.get_num_repetitions(); ++i) {
-                precond_op->apply(b, x_clone);
+                precond_op->apply(state.b, x_clone);
             }
             if (exec->get_master() != exec) {
                 exec->get_master()->remove_logger(apply_logger);
             }
             exec->remove_logger(apply_logger);
         }
-
-        add_or_set_member(this_precond_data, "completed", true, allocator);
-    } catch (const std::exception& e) {
-        auto encoded_name = encode_parameters(precond_name);
-        add_or_set_member(test_case["preconditioner"], encoded_name.c_str(),
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-        add_or_set_member(test_case["preconditioner"][encoded_name.c_str()],
-                          "completed", false, allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["preconditioner"][encoded_name.c_str()],
-                              "error", msg_value, allocator);
-        }
-        std::cerr << "Error when processing test case\n"
-                  << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
     }
-}
+};
 
 
 int main(int argc, char* argv[])
@@ -261,11 +293,11 @@ int main(int argc, char* argv[])
     FLAGS_formats = "csr";
     std::string header =
         "A benchmark for measuring preconditioner performance.\n";
-    std::string format = example_config;
+    std::string format = Generator::get_example_config();
     initialize_argument_parsing_matrix(&argc, &argv, header, format);
 
     std::string extra_information =
-        "Running with preconditioners: " + FLAGS_preconditioners + "\n";
+        "Running with preconditioners: " + FLAGS_preconditioners;
     print_general_information(extra_information);
 
     auto exec = get_executor(FLAGS_gpu_timer);
@@ -279,76 +311,10 @@ int main(int argc, char* argv[])
         std::exit(1);
     }
 
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+    auto test_cases = json::parse(get_input_stream());
 
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
-    }
-    auto annotate = annotate_functor{profiler_hook};
-    DefaultSystemGenerator<> generator{};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            validate_option_object(test_case);
-            if (!test_case.HasMember("preconditioner")) {
-                test_case.AddMember("preconditioner",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& precond_object = test_case["preconditioner"];
-            if (!FLAGS_overwrite &&
-                all_of(begin(preconditioners), end(preconditioners),
-                       [&precond_object](const std::string& s) {
-                           return precond_object.HasMember(s.c_str());
-                       })) {
-                continue;
-            }
-            std::clog << "Running test case\n" << test_case << std::endl;
-
-            // annotate the test case
-            auto test_case_range =
-                annotate(generator.describe_config(test_case));
-
-            auto data = generator.generate_matrix_data(test_case);
-
-            auto system_matrix =
-                share(formats::matrix_factory(FLAGS_formats, exec, data));
-            auto b = generator.create_multi_vector_random(
-                exec, system_matrix->get_size()[0]);
-            auto x = generator.create_multi_vector(
-                exec, system_matrix->get_size()[0], gko::zero<etype>());
-
-            std::clog << "Matrix is of size (" << system_matrix->get_size()[0]
-                      << ", " << system_matrix->get_size()[1] << ")"
-                      << std::endl;
-            add_or_set_member(test_case, "size", data.size[0], allocator);
-            for (const auto& precond_name : preconditioners) {
-                {
-                    auto precond_range = annotate(precond_name.c_str());
-                    run_preconditioner(precond_name.c_str(), exec,
-                                       system_matrix, b.get(), x.get(),
-                                       test_case, allocator);
-                }
-                std::clog << "Current state:" << std::endl
-                          << test_cases << std::endl;
-                backup_results(test_cases);
-            }
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up preconditioner, what(): " << e.what()
-                      << std::endl;
-        }
-    }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
+    run_test_cases(PreconditionerBenchmark{}, exec,
+                   get_timer(exec, FLAGS_gpu_timer), test_cases);
 
-    std::cout << test_cases << std::endl;
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp
index a9b1f9c1c93..d691309ab6a 100644
--- a/benchmark/solver/distributed/solver.cpp
+++ b/benchmark/solver/distributed/solver.cpp
@@ -52,7 +52,7 @@ struct Generator : public DistributedDefaultSystemGenerator<SolverGenerator> {
 
     std::unique_ptr<Vec> generate_rhs(std::shared_ptr<const gko::Executor> exec,
                                       const gko::LinOp* system_matrix,
-                                      rapidjson::Value& config) const
+                                      json& config) const
     {
         return Vec::create(
             exec, comm, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs},
@@ -82,9 +82,13 @@ int main(int argc, char* argv[])
     FLAGS_repetitions = "1";
     FLAGS_min_repetitions = 1;
 
+    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    const auto rank = comm.rank();
+    const auto do_print = rank == 0;
+
     std::string header =
         "A benchmark for measuring Ginkgo's distributed solvers\n";
-    std::string format = example_config + R"(
+    std::string format = solver_example_config + R"(
   The matrix will either be read from an input file if the filename parameter
   is given, or generated as a stencil matrix.
   If the filename parameter is given, all processes will read the file and
@@ -100,10 +104,7 @@ int main(int argc, char* argv[])
 )";
     std::string additional_json = R"(,"optimal":{"spmv":"csr-csr"})";
     initialize_argument_parsing_matrix(&argc, &argv, header, format,
-                                       additional_json);
-
-    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-    const auto rank = comm.rank();
+                                       additional_json, do_print);
 
     auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());
 
@@ -114,8 +115,8 @@ int main(int argc, char* argv[])
         "Running " + FLAGS_solvers + " with " +
         std::to_string(FLAGS_max_iters) + " iterations and residual goal of " +
         ss_rel_res_goal.str() + "\nThe number of right hand sides is " +
-        std::to_string(FLAGS_nrhs) + "\n";
-    if (rank == 0) {
+        std::to_string(FLAGS_nrhs);
+    if (do_print) {
         print_general_information(extra_information);
     }
 
@@ -136,17 +137,12 @@ int main(int argc, char* argv[])
   "optimal": {"spmv": "csr-csr"}]
 )"
                        : broadcast_json_input(get_input_stream(), comm);
-    rapidjson::Document test_cases;
-    test_cases.Parse(json_input.c_str());
-
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+    auto test_cases = json::parse(json_input);
 
-    run_solver_benchmarks(exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer),
-                          test_cases, Generator(comm), rank == 0);
+    run_test_cases(SolverBenchmark<Generator>{Generator{comm}}, exec,
+                   get_mpi_timer(exec, comm, FLAGS_gpu_timer), test_cases);
 
-    if (rank == 0) {
-        std::cout << test_cases << std::endl;
+    if (do_print) {
+        std::cout << std::setw(4) << test_cases << std::endl;
     }
 }
diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp
index 4efc5558a8e..b656102e5df 100644
--- a/benchmark/solver/solver.cpp
+++ b/benchmark/solver/solver.cpp
@@ -58,7 +58,7 @@ int main(int argc, char* argv[])
     FLAGS_min_repetitions = 1;
     std::string header =
         "A benchmark for measuring performance of Ginkgo's solvers.\n";
-    std::string format = example_config + R"(
+    std::string format = solver_example_config + R"(
   "optimal":"spmv" can be one of the recognized spmv formats
 )";
     std::string additional_json = R"(,"optimal":{"spmv":"csr"})";
@@ -72,29 +72,24 @@ int main(int argc, char* argv[])
         "Running " + FLAGS_solvers + " with " +
         std::to_string(FLAGS_max_iters) + " iterations and residual goal of " +
         ss_rel_res_goal.str() + "\nThe number of right hand sides is " +
-        std::to_string(FLAGS_nrhs) + "\n";
+        std::to_string(FLAGS_nrhs);
     print_general_information(extra_information);
 
     auto exec = get_executor(FLAGS_gpu_timer);
 
-    rapidjson::Document test_cases;
+    json test_cases;
     if (!FLAGS_overhead) {
-        rapidjson::IStreamWrapper jcin(get_input_stream());
-        test_cases.ParseStream(jcin);
+        test_cases = json::parse(get_input_stream());
     } else {
         // Fake test case to run once
         auto overhead_json = std::string() +
                              " [{\"filename\": \"overhead.mtx\", \"optimal\": "
                              "{ \"spmv\": \"csr\"}}]";
-        test_cases.Parse(overhead_json.c_str());
+        test_cases = json::parse(overhead_json);
     }
 
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
-
-    run_solver_benchmarks(exec, get_timer(exec, FLAGS_gpu_timer), test_cases,
-                          SolverGenerator{}, true);
+    run_test_cases(SolverBenchmark<SolverGenerator>{SolverGenerator{}}, exec,
+                   get_timer(exec, FLAGS_gpu_timer), test_cases);
 
-    std::cout << test_cases << std::endl;
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp
index ae9ae6dc1fb..4976e5759d4 100644
--- a/benchmark/solver/solver_common.hpp
+++ b/benchmark/solver/solver_common.hpp
@@ -37,8 +37,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
 #include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/iteration_control.hpp"
 #include "benchmark/utils/loggers.hpp"
 #include "benchmark/utils/preconditioners.hpp"
+#include "benchmark/utils/runner.hpp"
 
 
 #ifdef GINKGO_BENCHMARK_ENABLE_TUNING
@@ -107,7 +109,7 @@ DEFINE_bool(overhead, false,
             "If set, uses dummy data to benchmark Ginkgo overhead");
 
 
-std::string example_config = R"(
+std::string solver_example_config = R"(
   [
     {"filename": "my_file.mtx", "optimal": {"spmv": "ell-csr"},
      "rhs": "my_file_rhs.mtx"},
@@ -119,28 +121,6 @@ std::string example_config = R"(
 )";
 
 
-// input validation
-[[noreturn]] void print_config_error_and_exit()
-{
-    std::cerr << "Input has to be a JSON array of solver configurations:\n"
-              << example_config << std::endl;
-    std::exit(1);
-}
-
-
-void validate_option_object(const rapidjson::Value& value)
-{
-    if (!value.IsObject() ||
-        !((value.HasMember("size") && value.HasMember("stencil") &&
-           value["size"].IsInt64() && value["stencil"].IsString()) ||
-          (value.HasMember("filename") && value["filename"].IsString())) ||
-        (!value.HasMember("optimal") && !value["optimal"].HasMember("spmv") &&
-         !value["optimal"]["spmv"].IsString())) {
-        print_config_error_and_exit();
-    }
-}
-
-
 std::shared_ptr<const gko::stop::CriterionFactory> create_criterion(
     std::shared_ptr<const gko::Executor> exec, std::uint32_t max_iters)
 {
@@ -284,21 +264,17 @@ std::unique_ptr<gko::LinOpFactory> generate_solver(
 }
 
 
-void write_precond_info(const gko::LinOp* precond,
-                        rapidjson::Value& precond_info,
-                        rapidjson::MemoryPoolAllocator<>& allocator)
+void write_precond_info(const gko::LinOp* precond, json& precond_info)
 {
     if (const auto jacobi =
             dynamic_cast<const gko::preconditioner::Jacobi<etype>*>(precond)) {
         // extract block sizes
         const auto bdata =
             jacobi->get_parameters().block_pointers.get_const_data();
-        add_or_set_member(precond_info, "block_sizes",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
+        precond_info["block_sizes"] = json::array();
         const auto nblocks = jacobi->get_num_blocks();
         for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
-            precond_info["block_sizes"].PushBack(bdata[i + 1] - bdata[i],
-                                                 allocator);
+            precond_info["block_sizes"].push_back(bdata[i + 1] - bdata[i]);
         }
 
         // extract block precisions
@@ -306,24 +282,19 @@ void write_precond_info(const gko::LinOp* precond,
             jacobi->get_parameters()
                 .storage_optimization.block_wise.get_const_data();
         if (pdata) {
-            add_or_set_member(precond_info, "block_precisions",
-                              rapidjson::Value(rapidjson::kArrayType),
-                              allocator);
+            precond_info["block_precisions"] = json::array();
             for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
-                precond_info["block_precisions"].PushBack(
-                    static_cast<int>(pdata[i]), allocator);
+                precond_info["block_precisions"].push_back(
+                    static_cast<int>(pdata[i]));
             }
         }
 
         // extract condition numbers
         const auto cdata = jacobi->get_conditioning();
         if (cdata) {
-            add_or_set_member(precond_info, "block_conditioning",
-                              rapidjson::Value(rapidjson::kArrayType),
-                              allocator);
+            precond_info["block_conditioning"] = json::array();
             for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
-                precond_info["block_conditioning"].PushBack(cdata[i],
-                                                            allocator);
+                precond_info["block_conditioning"].push_back(cdata[i]);
             }
         }
     }
@@ -335,10 +306,10 @@ struct SolverGenerator : DefaultSystemGenerator<> {
 
     std::unique_ptr<Vec> generate_rhs(std::shared_ptr<const gko::Executor> exec,
                                       const gko::LinOp* system_matrix,
-                                      rapidjson::Value& config) const
+                                      json& config) const
     {
-        if (config.HasMember("rhs")) {
-            std::ifstream rhs_fd{config["rhs"].GetString()};
+        if (config.contains("rhs")) {
+            std::ifstream rhs_fd{config["rhs"].get<std::string>()};
             return gko::read<Vec>(rhs_fd, std::move(exec));
         } else {
             gko::dim<2> vec_size{system_matrix->get_size()[0], FLAGS_nrhs};
@@ -399,45 +370,112 @@ struct SolverGenerator : DefaultSystemGenerator<> {
 };
 
 
-template <typename VectorType>
-void solve_system(const std::string& solver_name,
-                  const std::string& precond_name,
-                  const char* precond_solver_name,
-                  std::shared_ptr<gko::Executor> exec,
-                  std::shared_ptr<Timer> timer,
-                  std::shared_ptr<const gko::LinOp> system_matrix,
-                  const VectorType* b, const VectorType* x,
-                  rapidjson::Value& test_case,
-                  rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& solver_case = test_case["solver"];
-        if (!FLAGS_overwrite && solver_case.HasMember(precond_solver_name)) {
-            return;
+template <typename Generator>
+struct solver_benchmark_state {
+    using Vec = typename Generator::Vec;
+    std::shared_ptr<gko::LinOp> system_matrix;
+    std::unique_ptr<Vec> b;
+    std::unique_ptr<Vec> x;
+};
+
+
+template <typename Generator>
+struct SolverBenchmark : Benchmark<solver_benchmark_state<Generator>> {
+    std::string name;
+    std::vector<std::string> precond_solvers;
+    std::map<std::string, std::pair<std::string, std::string>> decoder;
+    Generator generator;
+
+    SolverBenchmark(Generator generator) : name{"solver"}, generator{generator}
+    {
+        auto solvers = split(FLAGS_solvers, ',');
+        auto preconds = split(FLAGS_preconditioners, ',');
+        for (const auto& s : solvers) {
+            for (const auto& p : preconds) {
+                precond_solvers.push_back(s + (p == "none" ? "" : "-" + p));
+                decoder[precond_solvers.back()] = {s, p};
+            }
+        }
+    }
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return precond_solvers;
+    }
+
+    bool should_print() const override { return true; }
+
+    std::string get_example_config() const override
+    {
+        return solver_example_config;
+    }
+
+    bool validate_config(const json& value) const override
+    {
+        return ((value.contains("size") && value.contains("stencil") &&
+                 value["size"].is_number_integer() &&
+                 value["stencil"].is_string()) ||
+                (value.contains("filename") &&
+                 value["filename"].is_string())) &&
+               (value.contains("optimal") &&
+                value["optimal"].contains("spmv") &&
+                value["optimal"]["spmv"].is_string());
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        return Generator::describe_config(test_case);
+    }
+
+    solver_benchmark_state<Generator> setup(std::shared_ptr<gko::Executor> exec,
+                                            json& test_case) const override
+    {
+        solver_benchmark_state<Generator> state;
+
+        if (FLAGS_overhead) {
+            state.system_matrix = generator.initialize({1.0}, exec);
+            state.b = generator.initialize(
+                {std::numeric_limits<rc_etype>::quiet_NaN()}, exec);
+            state.x = generator.initialize({0.0}, exec);
+        } else {
+            state.system_matrix =
+                generator.generate_matrix_with_optimal_format(exec, test_case);
+            state.b = generator.generate_rhs(exec, state.system_matrix.get(),
+                                             test_case);
+            state.x = generator.generate_initial_guess(
+                exec, state.system_matrix.get(), state.b.get());
         }
 
-        add_or_set_member(solver_case, precond_solver_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-        auto& solver_json = solver_case[precond_solver_name];
-        add_or_set_member(solver_json, "recurrent_residuals",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(solver_json, "true_residuals",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(solver_json, "implicit_residuals",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(solver_json, "iteration_timestamps",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        if (b->get_size()[1] == 1 && !FLAGS_overhead) {
-            auto rhs_norm = compute_norm2(b);
-            add_or_set_member(solver_json, "rhs_norm", rhs_norm, allocator);
+        std::clog << "Matrix is of size (" << state.system_matrix->get_size()[0]
+                  << ", " << state.system_matrix->get_size()[1] << ")"
+                  << std::endl;
+        test_case["rows"] = state.system_matrix->get_size()[0];
+        test_case["cols"] = state.system_matrix->get_size()[1];
+        return state;
+    }
+
+
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             solver_benchmark_state<Generator>& state,
+             const std::string& encoded_solver_name,
+             json& solver_case) const override
+    {
+        const auto decoded_pair = decoder.at(encoded_solver_name);
+        auto& solver_name = decoded_pair.first;
+        auto& precond_name = decoded_pair.second;
+        solver_case["recurrent_residuals"] = json::array();
+        solver_case["true_residuals"] = json::array();
+        solver_case["implicit_residuals"] = json::array();
+        solver_case["iteration_timestamps"] = json::array();
+        if (state.b->get_size()[1] == 1 && !FLAGS_overhead) {
+            auto rhs_norm = compute_norm2(state.b.get());
+            solver_case["rhs_norm"] = rhs_norm;
         }
         for (auto stage : {"generate", "apply"}) {
-            add_or_set_member(solver_json, stage,
-                              rapidjson::Value(rapidjson::kObjectType),
-                              allocator);
-            add_or_set_member(solver_json[stage], "components",
-                              rapidjson::Value(rapidjson::kObjectType),
-                              allocator);
+            solver_case[stage] = json::object();
+            solver_case[stage]["components"] = json::object();
         }
 
         IterationControl ic{timer};
@@ -445,24 +483,24 @@ void solve_system(const std::string& solver_name,
         // warm run
         std::shared_ptr<gko::LinOp> solver;
         for (auto _ : ic.warmup_run()) {
-            auto x_clone = clone(x);
+            auto x_clone = clone(state.x);
             auto precond = precond_factory.at(precond_name)(exec);
             solver = generate_solver(exec, give(precond), solver_name,
                                      FLAGS_warmup_max_iters)
-                         ->generate(system_matrix);
-            solver->apply(b, x_clone);
+                         ->generate(state.system_matrix);
+            solver->apply(state.b, x_clone);
             exec->synchronize();
         }
 
         // detail run
         if (FLAGS_detailed && !FLAGS_overhead) {
             // slow run, get the time of each functions
-            auto x_clone = clone(x);
+            auto x_clone = clone(state.x);
 
             {
                 auto gen_logger = create_operations_logger(
                     FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                    solver_json["generate"]["components"], allocator, 1);
+                    solver_case["generate"]["components"], 1);
                 exec->add_logger(gen_logger);
                 if (exec != exec->get_master()) {
                     exec->get_master()->add_logger(gen_logger);
@@ -471,7 +509,7 @@ void solve_system(const std::string& solver_name,
                 auto precond = precond_factory.at(precond_name)(exec);
                 solver = generate_solver(exec, give(precond), solver_name,
                                          FLAGS_max_iters)
-                             ->generate(system_matrix);
+                             ->generate(state.system_matrix);
 
                 exec->remove_logger(gen_logger);
                 if (exec != exec->get_master()) {
@@ -481,25 +519,22 @@ void solve_system(const std::string& solver_name,
 
             if (auto prec =
                     dynamic_cast<const gko::Preconditionable*>(solver.get())) {
-                add_or_set_member(solver_json, "preconditioner",
-                                  rapidjson::Value(rapidjson::kObjectType),
-                                  allocator);
+                solver_case["preconditioner"] = json::object();
                 write_precond_info(
                     clone(exec->get_master(), prec->get_preconditioner()).get(),
-                    solver_json["preconditioner"], allocator);
+                    solver_case["preconditioner"]);
             }
 
             {
                 auto apply_logger = create_operations_logger(
                     FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                    solver_json["apply"]["components"], allocator, 1);
+                    solver_case["apply"]["components"], 1);
                 exec->add_logger(apply_logger);
                 if (exec != exec->get_master()) {
                     exec->get_master()->add_logger(apply_logger);
                 }
 
-
-                solver->apply(b, x_clone);
+                solver->apply(state.b, x_clone);
 
                 exec->remove_logger(apply_logger);
                 if (exec != exec->get_master()) {
@@ -508,17 +543,18 @@ void solve_system(const std::string& solver_name,
             }
 
             // slow run, gets the recurrent and true residuals of each iteration
-            if (b->get_size()[1] == 1) {
-                x_clone = clone(x);
+            if (state.b->get_size()[1] == 1) {
+                x_clone = clone(state.x);
                 auto res_logger = std::make_shared<ResidualLogger<etype>>(
-                    system_matrix, b, solver_json["recurrent_residuals"],
-                    solver_json["true_residuals"],
-                    solver_json["implicit_residuals"],
-                    solver_json["iteration_timestamps"], allocator);
+                    state.system_matrix, state.b,
+                    solver_case["recurrent_residuals"],
+                    solver_case["true_residuals"],
+                    solver_case["implicit_residuals"],
+                    solver_case["iteration_timestamps"]);
                 solver->add_logger(res_logger);
-                solver->apply(b, x_clone);
+                solver->apply(state.b, x_clone);
                 if (!res_logger->has_implicit_res_norms()) {
-                    solver_json.RemoveMember("implicit_residuals");
+                    solver_case.erase("implicit_residuals");
                 }
             }
             exec->synchronize();
@@ -528,16 +564,16 @@ void solve_system(const std::string& solver_name,
         auto it_logger = std::make_shared<IterationLogger>();
         auto generate_timer = get_timer(exec, FLAGS_gpu_timer);
         auto apply_timer = ic.get_timer();
-        auto x_clone = clone(x);
+        auto x_clone = clone(state.x);
         for (auto status : ic.run(false)) {
-            x_clone = clone(x);
+            x_clone = clone(state.x);
 
             exec->synchronize();
             generate_timer->tic();
             auto precond = precond_factory.at(precond_name)(exec);
             solver = generate_solver(exec, give(precond), solver_name,
                                      FLAGS_max_iters)
-                         ->generate(system_matrix);
+                         ->generate(state.system_matrix);
             generate_timer->toc();
 
             exec->synchronize();
@@ -545,165 +581,33 @@ void solve_system(const std::string& solver_name,
                 solver->add_logger(it_logger);
             }
             apply_timer->tic();
-            solver->apply(b, x_clone);
+            solver->apply(state.b, x_clone);
             apply_timer->toc();
             if (ic.get_num_repetitions() == 0) {
                 solver->remove_logger(it_logger);
             }
         }
-        it_logger->write_data(solver_json["apply"], allocator);
+        it_logger->write_data(solver_case["apply"]);
 
-        if (b->get_size()[1] == 1 && !FLAGS_overhead) {
+        if (state.b->get_size()[1] == 1 && !FLAGS_overhead) {
             // a solver is considered direct if it didn't log any iterations
-            if (solver_json["apply"].HasMember("iterations") &&
-                solver_json["apply"]["iterations"].GetInt() == 0) {
-                auto error =
-                    compute_direct_error(solver.get(), b, x_clone.get());
-                add_or_set_member(solver_json, "forward_error", error,
-                                  allocator);
-            }
-            auto residual =
-                compute_residual_norm(system_matrix.get(), b, x_clone.get());
-            add_or_set_member(solver_json, "residual_norm", residual,
-                              allocator);
-        }
-        add_or_set_member(solver_json["generate"], "time",
-                          generate_timer->compute_time(FLAGS_timer_method),
-                          allocator);
-        add_or_set_member(solver_json["apply"], "time",
-                          apply_timer->compute_time(FLAGS_timer_method),
-                          allocator);
-        add_or_set_member(solver_json, "repetitions",
-                          apply_timer->get_num_repetitions(), allocator);
-
-        // compute and write benchmark data
-        add_or_set_member(solver_json, "completed", true, allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case["solver"][precond_solver_name], "completed",
-                          false, allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["solver"][precond_solver_name], "error",
-                              msg_value, allocator);
-        }
-        std::cerr << "Error when processing test case\n"
-                  << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
-    }
-}
-
-
-template <typename SystemGenerator>
-void run_solver_benchmarks(std::shared_ptr<gko::Executor> exec,
-                           std::shared_ptr<Timer> timer,
-                           rapidjson::Document& test_cases,
-                           const SystemGenerator& system_generator,
-                           bool do_print)
-{
-    auto solvers = split(FLAGS_solvers, ',');
-    auto preconds = split(FLAGS_preconditioners, ',');
-    std::vector<std::string> precond_solvers;
-    for (const auto& s : solvers) {
-        for (const auto& p : preconds) {
-            precond_solvers.push_back(s + (p == "none" ? "" : "-" + p));
-        }
-    }
-
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
-    }
-    auto annotate = annotate_functor{profiler_hook};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            validate_option_object(test_case);
-            if (!test_case.HasMember("solver")) {
-                test_case.AddMember("solver",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& solver_case = test_case["solver"];
-            if (!FLAGS_overwrite &&
-                all_of(begin(precond_solvers), end(precond_solvers),
-                       [&solver_case](const std::string& s) {
-                           return solver_case.HasMember(s.c_str());
-                       })) {
-                continue;
-            }
-            // annotate the test case
-            auto test_case_range =
-                annotate(system_generator.describe_config(test_case));
-
-            if (do_print) {
-                std::clog << "Running test case\n" << test_case << std::endl;
-            }
-
-            using Vec = typename SystemGenerator::Vec;
-            std::shared_ptr<gko::LinOp> system_matrix;
-            std::unique_ptr<Vec> b;
-            std::unique_ptr<Vec> x;
-            if (FLAGS_overhead) {
-                system_matrix = system_generator.initialize({1.0}, exec);
-                b = system_generator.initialize(
-                    {std::numeric_limits<rc_etype>::quiet_NaN()}, exec);
-                x = system_generator.initialize({0.0}, exec);
-            } else {
-                system_matrix =
-                    system_generator.generate_matrix_with_optimal_format(
-                        exec, test_case);
-                b = system_generator.generate_rhs(exec, system_matrix.get(),
-                                                  test_case);
-                x = system_generator.generate_initial_guess(
-                    exec, system_matrix.get(), b.get());
-            }
-
-            if (do_print) {
-                std::clog << "Matrix is of size ("
-                          << system_matrix->get_size()[0] << ", "
-                          << system_matrix->get_size()[1] << ")" << std::endl;
-            }
-            add_or_set_member(test_case, "size", system_matrix->get_size()[0],
-                              allocator);
-            auto precond_solver_name = begin(precond_solvers);
-            for (const auto& solver_name : solvers) {
-                auto solver_range = annotate(solver_name.c_str());
-                for (const auto& precond_name : preconds) {
-                    if (do_print) {
-                        std::clog
-                            << "\tRunning solver: " << *precond_solver_name
-                            << std::endl;
-                    }
-                    {
-                        auto precond_range = annotate(precond_name.c_str());
-                        solve_system(solver_name, precond_name,
-                                     precond_solver_name->c_str(), exec, timer,
-                                     system_matrix, b.get(), x.get(), test_case,
-                                     allocator);
-                    }
-                    if (do_print) {
-                        backup_results(test_cases);
-                    }
-                    ++precond_solver_name;
-                }
-            }
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up solver, what(): " << e.what()
-                      << std::endl;
-            if (FLAGS_keep_errors) {
-                rapidjson::Value msg_value;
-                msg_value.SetString(e.what(), allocator);
-                add_or_set_member(test_case, "error", msg_value, allocator);
+            if (solver_case["apply"].contains("iterations") &&
+                solver_case["apply"]["iterations"].get<gko::int64>() == 0) {
+                auto error = compute_direct_error(solver.get(), state.b.get(),
+                                                  x_clone.get());
+                solver_case["forward_error"] = error;
             }
+            auto residual = compute_residual_norm(state.system_matrix.get(),
+                                                  state.b.get(), x_clone.get());
+            solver_case["residual_norm"] = residual;
         }
+        solver_case["generate"]["time"] =
+            generate_timer->compute_time(FLAGS_timer_method);
+        solver_case["apply"]["time"] =
+            apply_timer->compute_time(FLAGS_timer_method);
+        solver_case["repetitions"] = apply_timer->get_num_repetitions();
     }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
-}
+};
 
 
 #endif  // GINKGO_BENCHMARK_SOLVER_SOLVER_COMMON_HPP
diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp
index 66e5707c559..2ee766d4f83 100644
--- a/benchmark/sparse_blas/operations.cpp
+++ b/benchmark/sparse_blas/operations.cpp
@@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "benchmark/sparse_blas/operations.hpp"
-#include "benchmark/utils/json.hpp"
 #include "core/factorization/elimination_forest.hpp"
 #include "core/factorization/symbolic.hpp"
 #include "core/matrix/csr_kernels.hpp"
@@ -632,11 +631,9 @@ class SymbolicLuOperation : public BenchmarkOperation {
 
     void run() override { gko::factorization::symbolic_lu(mtx_, result_); }
 
-    void write_stats(rapidjson::Value& object,
-                     rapidjson::MemoryPoolAllocator<>& allocator) override
+    void write_stats(json& object) override
     {
-        add_or_set_member(object, "factor_nonzeros",
-                          result_->get_num_stored_elements(), allocator);
+        object["factor_nonzeros"] = result_->get_num_stored_elements();
     }
 
 private:
@@ -680,11 +677,9 @@ class SymbolicCholeskyOperation : public BenchmarkOperation {
                                               forest_);
     }
 
-    void write_stats(rapidjson::Value& object,
-                     rapidjson::MemoryPoolAllocator<>& allocator) override
+    void write_stats(json& object) override
     {
-        add_or_set_member(object, "factor_nonzeros",
-                          result_->get_num_stored_elements(), allocator);
+        object["factor_nonzeros"] = result_->get_num_stored_elements();
     }
 
 private:
diff --git a/benchmark/sparse_blas/operations.hpp b/benchmark/sparse_blas/operations.hpp
index 99cf72b8e59..48034eb8a1f 100644
--- a/benchmark/sparse_blas/operations.hpp
+++ b/benchmark/sparse_blas/operations.hpp
@@ -36,9 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <tuple>
 
 
-#include <rapidjson/document.h>
-
-
+#include "benchmark/utils/json.hpp"
 #include "benchmark/utils/types.hpp"
 
 
@@ -79,9 +77,7 @@ class BenchmarkOperation {
     /**
      * Allows the operation to write arbitrary information to the JSON output.
      */
-    virtual void write_stats(rapidjson::Value& object,
-                             rapidjson::MemoryPoolAllocator<>& allocator)
-    {}
+    virtual void write_stats(json& object) {}
 };
 
 
diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp
index 8c054709fdf..21df4d9c448 100644
--- a/benchmark/sparse_blas/sparse_blas.cpp
+++ b/benchmark/sparse_blas/sparse_blas.cpp
@@ -47,7 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "benchmark/sparse_blas/operations.hpp"
 #include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
-#include "benchmark/utils/spmv_validation.hpp"
+#include "benchmark/utils/iteration_control.hpp"
+#include "benchmark/utils/runner.hpp"
 #include "benchmark/utils/types.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 
@@ -74,18 +75,64 @@ DEFINE_bool(validate, false,
             "against the ReferenceExecutor solution.");
 
 
-void apply_sparse_blas(const char* operation_name,
-                       std::shared_ptr<gko::Executor> exec, const Mtx* mtx,
-                       rapidjson::Value& test_case,
-                       rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        add_or_set_member(test_case, operation_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
+using Generator = DefaultSystemGenerator<>;
+
+
+struct SparseBlasBenchmark : Benchmark<std::unique_ptr<Mtx>> {
+    std::string name;
+    std::vector<std::string> operations;
+
+    SparseBlasBenchmark()
+        : name{"sparse_blas"}, operations{split(FLAGS_operations)}
+    {}
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return operations;
+    }
+
+    bool should_print() const override { return true; }
+
+    bool validate_config(const json& value) const override
+    {
+        return Generator::validate_config(value);
+    }
+
+    std::string get_example_config() const override
+    {
+        return Generator::get_example_config();
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        return Generator::describe_config(test_case);
+    }
+
+    std::unique_ptr<Mtx> setup(std::shared_ptr<gko::Executor> exec,
+                               json& test_case) const override
+    {
+        auto data = Generator::generate_matrix_data(test_case);
+        data.ensure_row_major_order();
+        std::clog << "Matrix is of size (" << data.size[0] << ", "
+                  << data.size[1] << "), " << data.nonzeros.size() << std::endl;
+        test_case["rows"] = data.size[0];
+        test_case["cols"] = data.size[1];
+        test_case["nonzeros"] = data.nonzeros.size();
+
+        auto mtx = Mtx::create(exec, data.size, data.nonzeros.size());
+        mtx->read(data);
+        return mtx;
+    }
+
 
-        auto op = get_operation(operation_name, mtx);
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             std::unique_ptr<Mtx>& mtx, const std::string& operation_name,
+             json& operation_case) const override
+    {
+        auto op = get_operation(operation_name, mtx.get());
 
-        auto timer = get_timer(exec, FLAGS_gpu_timer);
         IterationControl ic(timer);
 
         // warm run
@@ -105,54 +152,30 @@ void apply_sparse_blas(const char* operation_name,
         const auto flops = static_cast<double>(op->get_flops());
         const auto mem = static_cast<double>(op->get_memory());
         const auto repetitions = ic.get_num_repetitions();
-        add_or_set_member(test_case[operation_name], "time", runtime,
-                          allocator);
-        add_or_set_member(test_case[operation_name], "flops", flops / runtime,
-                          allocator);
-        add_or_set_member(test_case[operation_name], "bandwidth", mem / runtime,
-                          allocator);
-        add_or_set_member(test_case[operation_name], "repetitions", repetitions,
-                          allocator);
+        operation_case["time"] = runtime;
+        operation_case["flops"] = flops / runtime;
+        operation_case["bandwidth"] = mem / runtime;
+        operation_case["repetitions"] = repetitions;
 
         if (FLAGS_validate) {
             auto validation_result = op->validate();
-            add_or_set_member(test_case[operation_name], "correct",
-                              validation_result.first, allocator);
-            add_or_set_member(test_case[operation_name], "error",
-                              validation_result.second, allocator);
+            operation_case["correct"] = validation_result.first;
+            operation_case["error"] = validation_result.second;
         }
         if (FLAGS_detailed) {
-            add_or_set_member(test_case[operation_name], "components",
-                              rapidjson::Value(rapidjson::kObjectType),
-                              allocator);
+            operation_case["components"] = json::object();
             auto gen_logger = create_operations_logger(
                 FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                test_case[operation_name]["components"], allocator,
-                repetitions);
+                operation_case["components"], repetitions);
             exec->add_logger(gen_logger);
             for (unsigned i = 0; i < repetitions; i++) {
                 op->run();
             }
             exec->remove_logger(gen_logger);
         }
-        op->write_stats(test_case[operation_name], allocator);
-
-        add_or_set_member(test_case[operation_name], "completed", true,
-                          allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case[operation_name], "completed", false,
-                          allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case[operation_name], "error", msg_value,
-                              allocator);
-        }
-        std::cerr << "Error when processing test case\n"
-                  << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
+        op->write_stats(operation_case);
     }
-}
+};
 
 
 int main(int argc, char* argv[])
@@ -160,86 +183,18 @@ int main(int argc, char* argv[])
     std::string header =
         "A benchmark for measuring performance of Ginkgo's sparse BLAS "
         "operations.\n";
-    std::string format = example_config;
+    std::string format = Generator::get_example_config();
     initialize_argument_parsing_matrix(&argc, &argv, header, format);
 
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
 
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+    auto test_cases = json::parse(get_input_stream());
 
     std::string extra_information = "The operations are " + FLAGS_operations;
     print_general_information(extra_information);
 
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
-    }
-    auto annotate = annotate_functor{profiler_hook};
-
-    auto operations = split(FLAGS_operations, ',');
-
-    DefaultSystemGenerator<> generator{};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            validate_option_object(test_case);
-            if (!test_case.HasMember(benchmark_name)) {
-                test_case.AddMember(rapidjson::Value(benchmark_name, allocator),
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& sp_blas_case = test_case[benchmark_name];
-            std::clog << "Running test case\n" << test_case << std::endl;
-            auto data = generator.generate_matrix_data(test_case);
-            data.ensure_row_major_order();
-            std::clog << "Matrix is of size (" << data.size[0] << ", "
-                      << data.size[1] << "), " << data.nonzeros.size()
-                      << std::endl;
-            add_or_set_member(test_case, "rows", data.size[0], allocator);
-            add_or_set_member(test_case, "cols", data.size[1], allocator);
-            add_or_set_member(test_case, "nonzeros", data.nonzeros.size(),
-                              allocator);
-
-            auto mtx = Mtx::create(exec, data.size, data.nonzeros.size());
-            mtx->read(data);
-            // annotate the test case
-            auto test_case_range =
-                annotate(generator.describe_config(test_case));
-            for (const auto& operation_name : operations) {
-                if (FLAGS_overwrite ||
-                    !sp_blas_case.HasMember(operation_name.c_str())) {
-                    {
-                        auto operation_range = annotate(operation_name.c_str());
-                        apply_sparse_blas(operation_name.c_str(), exec,
-                                          mtx.get(), sp_blas_case, allocator);
-                    }
-                    std::clog << "Current state:" << std::endl
-                              << test_cases << std::endl;
-                    backup_results(test_cases);
-                }
-            }
-            // write the output if we have no strategies
-            backup_results(test_cases);
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up matrix data, what(): " << e.what()
-                      << std::endl;
-            if (FLAGS_keep_errors) {
-                rapidjson::Value msg_value;
-                msg_value.SetString(e.what(), allocator);
-                add_or_set_member(test_case, "error", msg_value, allocator);
-            }
-        }
-    }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
+    run_test_cases(SparseBlasBenchmark{}, exec,
+                   get_timer(exec, FLAGS_gpu_timer), test_cases);
 
-    std::cout << test_cases << std::endl;
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp
index 9b7e4ad8c8f..202aad15c7e 100644
--- a/benchmark/spmv/distributed/spmv.cpp
+++ b/benchmark/spmv/distributed/spmv.cpp
@@ -58,38 +58,7 @@ DEFINE_string(non_local_formats, "csr",
               "run. See the 'formats' option for a list of supported versions");
 
 
-std::string example_config = R"(
-  [
-    {"size": 100, "stencil": "7pt", "comm_pattern": "stencil"},
-    {"filename": "my_file.mtx"}
-  ]
-)";
-
-
-[[noreturn]] void print_config_error_and_exit()
-{
-    std::cerr << "Input has to be a JSON array of matrix configurations:\n"
-              << example_config << std::endl;
-    std::exit(1);
-}
-
-
-struct Generator : DistributedDefaultSystemGenerator<DefaultSystemGenerator<>> {
-    Generator(gko::experimental::mpi::communicator comm)
-        : DistributedDefaultSystemGenerator<DefaultSystemGenerator<>>{
-              std::move(comm), {}}
-    {}
-
-    void validate_options(const rapidjson::Value& options) const
-    {
-        if (!options.IsObject() ||
-            !((options.HasMember("size") && options.HasMember("stencil") &&
-               options.HasMember("comm_pattern")) ||
-              options.HasMember("filename"))) {
-            print_config_error_and_exit();
-        }
-    }
-};
+using Generator = DistributedDefaultSystemGenerator<DefaultSystemGenerator<>>;
 
 
 int main(int argc, char* argv[])
@@ -98,18 +67,19 @@ int main(int argc, char* argv[])
 
     const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
     const auto rank = comm.rank();
+    const auto do_print = rank == 0;
 
     std::string header =
         "A benchmark for measuring performance of Ginkgo's spmv.\n";
-    std::string format = example_config;
-    initialize_argument_parsing_matrix(&argc, &argv, header, format);
-
-    if (rank == 0) {
-        std::string extra_information = "The formats are [" +
-                                        FLAGS_local_formats + "]x[" +
-                                        FLAGS_non_local_formats + "]\n" +
-                                        "The number of right hand sides is " +
-                                        std::to_string(FLAGS_nrhs) + "\n";
+    std::string format = Generator::get_example_config();
+    initialize_argument_parsing_matrix(&argc, &argv, header, format, "",
+                                       do_print);
+
+    if (do_print) {
+        std::string extra_information =
+            "The formats are [" + FLAGS_local_formats + "]x[" +
+            FLAGS_non_local_formats + "]\n" +
+            "The number of right hand sides is " + std::to_string(FLAGS_nrhs);
         print_general_information(extra_information);
     }
 
@@ -125,16 +95,13 @@ int main(int argc, char* argv[])
     }
 
     std::string json_input = broadcast_json_input(get_input_stream(), comm);
-    rapidjson::Document test_cases;
-    test_cases.Parse(json_input.c_str());
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+    auto test_cases = json::parse(json_input);
 
-    run_spmv_benchmark(exec, test_cases, formats, Generator{comm},
-                       get_mpi_timer(exec, comm, FLAGS_gpu_timer), rank == 0);
+    run_test_cases(SpmvBenchmark<Generator>{Generator{comm}, formats, do_print},
+                   exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer),
+                   test_cases);
 
-    if (rank == 0) {
-        std::cout << test_cases << std::endl;
+    if (do_print) {
+        std::cout << std::setw(4) << test_cases << std::endl;
     }
 }
diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp
index 034437907c8..abd1b783019 100644
--- a/benchmark/spmv/spmv.cpp
+++ b/benchmark/spmv/spmv.cpp
@@ -41,48 +41,29 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
-#include "benchmark/utils/spmv_validation.hpp"
 
 
-struct Generator : DefaultSystemGenerator<> {
-    void validate_options(const rapidjson::Value& options) const
-    {
-        if (!options.IsObject() ||
-            !((options.HasMember("size") && options.HasMember("stencil")) ||
-              options.HasMember("filename"))) {
-            std::cerr
-                << "Input has to be a JSON array of matrix configurations:\n"
-                << example_config << std::endl;
-            std::exit(1);
-        }
-    }
-};
+using Generator = DefaultSystemGenerator<>;
 
 
 int main(int argc, char* argv[])
 {
     std::string header =
         "A benchmark for measuring performance of Ginkgo's spmv.\n";
-    std::string format = example_config;
+    std::string format = Generator::get_example_config();
     initialize_argument_parsing_matrix(&argc, &argv, header, format);
 
     std::string extra_information = "The formats are " + FLAGS_formats +
                                     "\nThe number of right hand sides is " +
-                                    std::to_string(FLAGS_nrhs) + "\n";
+                                    std::to_string(FLAGS_nrhs);
     print_general_information(extra_information);
 
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
-    auto formats = split(FLAGS_formats, ',');
 
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+    auto test_cases = json::parse(get_input_stream());
 
-    run_spmv_benchmark(exec, test_cases, formats, Generator{},
-                       get_timer(exec, FLAGS_gpu_timer), true);
+    run_test_cases(SpmvBenchmark<Generator>{Generator{}, split(FLAGS_formats)},
+                   exec, get_timer(exec, FLAGS_gpu_timer), test_cases);
 
-    std::cout << test_cases << std::endl;
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp
index 3c8d886df3b..4a7d014de8b 100644
--- a/benchmark/spmv/spmv_common.hpp
+++ b/benchmark/spmv/spmv_common.hpp
@@ -36,7 +36,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
+#include "benchmark/utils/iteration_control.hpp"
 #include "benchmark/utils/loggers.hpp"
+#include "benchmark/utils/runner.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
 #ifdef GINKGO_BENCHMARK_ENABLE_TUNING
@@ -48,57 +50,119 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 DEFINE_uint32(nrhs, 1, "The number of right hand sides");
 
 
-// This function supposes that management of `FLAGS_overwrite` is done before
-// calling it
-template <typename Generator, typename VectorType, typename IndexType>
-void apply_spmv(const char* format_name, std::shared_ptr<gko::Executor> exec,
-                const Generator& generator, std::shared_ptr<Timer> timer,
-                const gko::matrix_data<etype, IndexType>& data,
-                const VectorType* b, const VectorType* x,
-                const VectorType* answer, rapidjson::Value& test_case,
-                rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& spmv_case = test_case["spmv"];
-        add_or_set_member(spmv_case, format_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
+template <typename Generator>
+struct spmv_benchmark_state {
+    gko::matrix_data<etype, typename Generator::index_type> data;
+    std::unique_ptr<typename Generator::Vec> x;
+    std::unique_ptr<typename Generator::Vec> b;
+    std::unique_ptr<typename Generator::Vec> answer;
+};
+
+
+template <typename Generator>
+struct SpmvBenchmark : Benchmark<spmv_benchmark_state<Generator>> {
+    using Vec = typename Generator::Vec;
+    std::string name;
+    std::vector<std::string> formats;
+    bool do_print;
+    Generator generator;
+
+    SpmvBenchmark(Generator generator, std::vector<std::string> formats,
+                  bool do_print = true)
+        : name{"spmv"},
+          formats{std::move(formats)},
+          do_print{do_print},  // initializers follow member declaration order
+          generator{std::move(generator)}
+    {}
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return formats;
+    }
+
+    bool should_print() const override { return do_print; }
 
+    std::string get_example_config() const override
+    {
+        return generator.get_example_config();
+    }
+
+    bool validate_config(const json& test_case) const override
+    {
+        return generator.validate_config(test_case);
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        return generator.describe_config(test_case);
+    }
+
+    spmv_benchmark_state<Generator> setup(std::shared_ptr<gko::Executor> exec,
+                                          json& test_case) const override
+    {
+        spmv_benchmark_state<Generator> state;
+        state.data = generator.generate_matrix_data(test_case);
+
+        auto nrhs = FLAGS_nrhs;
+        state.b = generator.create_multi_vector_random(
+            exec, gko::dim<2>{state.data.size[1], nrhs});
+        state.x = generator.create_multi_vector_random(
+            exec, gko::dim<2>{state.data.size[0], nrhs});
+        if (do_print) {
+            std::clog << "Matrix is of size (" << state.data.size[0] << ", "
+                      << state.data.size[1] << "), "
+                      << state.data.nonzeros.size() << std::endl;
+        }
+        test_case["rows"] = state.data.size[0];
+        test_case["cols"] = state.data.size[1];
+        test_case["nonzeros"] = state.data.nonzeros.size();
+        if (FLAGS_detailed) {
+            state.answer = gko::clone(state.x);
+            auto system_matrix =
+                generator.generate_matrix_with_default_format(exec, state.data);
+            exec->synchronize();
+            system_matrix->apply(state.b, state.answer);
+            exec->synchronize();
+        }
+        return state;
+    }
+
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             spmv_benchmark_state<Generator>& state,
+             const std::string& format_name, json& format_case) const override
+    {
         auto system_matrix = generator.generate_matrix_with_format(
-            exec, format_name, data, &spmv_case[format_name], &allocator);
+            exec, format_name, state.data, &format_case);
 
         // check the residual
         if (FLAGS_detailed) {
-            auto x_clone = clone(x);
+            auto x_clone = clone(state.x);
             exec->synchronize();
-            system_matrix->apply(b, x_clone);
+            system_matrix->apply(state.b, x_clone);
             exec->synchronize();
             auto max_relative_norm2 =
-                compute_max_relative_norm2(x_clone.get(), answer);
-            add_or_set_member(spmv_case[format_name], "max_relative_norm2",
-                              max_relative_norm2, allocator);
+                compute_max_relative_norm2(x_clone.get(), state.answer.get());
+            format_case["max_relative_norm2"] = max_relative_norm2;
         }
 
         IterationControl ic{timer};
         // warm run
         for (auto _ : ic.warmup_run()) {
-            auto x_clone = clone(x);
+            auto x_clone = clone(state.x);
             exec->synchronize();
-            system_matrix->apply(b, x_clone);
+            system_matrix->apply(state.b, x_clone);
             exec->synchronize();
         }
 
         // tuning run
 #ifdef GINKGO_BENCHMARK_ENABLE_TUNING
         auto& format_case = spmv_case[format_name];
-        if (!format_case.HasMember("tuning")) {
-            format_case.AddMember(
-                "tuning", rapidjson::Value(rapidjson::kObjectType), allocator);
-        }
+        format_case["tuning"] = json::object();
         auto& tuning_case = format_case["tuning"];
-        add_or_set_member(tuning_case, "time",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(tuning_case, "values",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
+        tuning_case["time"] = json::array();
+        tuning_case["values"] = json::array();
 
         // Enable tuning for this portion of code
         gko::_tuning_flag = true;
@@ -112,13 +176,13 @@ void apply_spmv(const char* format_name, std::shared_ptr<gko::Executor> exec,
             gko::_tuned_value = val;
             auto tuning_timer = get_timer(exec, FLAGS_gpu_timer);
             IterationControl ic_tuning{tuning_timer};
-            auto x_clone = clone(x);
+            auto x_clone = clone(state.x);
             for (auto _ : ic_tuning.run()) {
-                system_matrix->apply(b, x_clone);
+                system_matrix->apply(state.b, x_clone);
             }
-            tuning_case["time"].PushBack(
-                ic_tuning.compute_time(FLAGS_timer_method), allocator);
-            tuning_case["values"].PushBack(val, allocator);
+            tuning_case["time"].push_back(
+                ic_tuning.compute_time(FLAGS_timer_method));
+            tuning_case["values"].push_back(val);
         }
         // We put back the flag to false to use the default (non-tuned) values
         // for the following
@@ -126,142 +190,41 @@ void apply_spmv(const char* format_name, std::shared_ptr<gko::Executor> exec,
 #endif  // GINKGO_BENCHMARK_ENABLE_TUNING
 
         // timed run
-        auto x_clone = clone(x);
+        auto x_clone = clone(state.x);
         for (auto _ : ic.run()) {
-            system_matrix->apply(b, x_clone);
-        }
-        add_or_set_member(spmv_case[format_name], "time",
-                          ic.compute_time(FLAGS_timer_method), allocator);
-        add_or_set_member(spmv_case[format_name], "repetitions",
-                          ic.get_num_repetitions(), allocator);
-
-        // compute and write benchmark data
-        add_or_set_member(spmv_case[format_name], "completed", true, allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case["spmv"][format_name], "completed", false,
-                          allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["spmv"][format_name], "error",
-                              msg_value, allocator);
+            system_matrix->apply(state.b, x_clone);
         }
-        std::cerr << "Error when processing test case\n"
-                  << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
+        format_case["time"] = ic.compute_time(FLAGS_timer_method);
+        format_case["repetitions"] = ic.get_num_repetitions();
     }
-}
-
-
-template <typename SystemGenerator>
-void run_spmv_benchmark(std::shared_ptr<gko::Executor> exec,
-                        rapidjson::Document& test_cases,
-                        const std::vector<std::string> formats,
-                        const SystemGenerator& system_generator,
-                        std::shared_ptr<Timer> timer, bool do_print)
-{
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
-    }
-    auto annotate = annotate_functor{profiler_hook};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            system_generator.validate_options(test_case);
-            if (!test_case.HasMember("spmv")) {
-                test_case.AddMember("spmv",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& spmv_case = test_case["spmv"];
-            if (!FLAGS_overwrite &&
-                all_of(begin(formats), end(formats),
-                       [&spmv_case](const std::string& s) {
-                           return spmv_case.HasMember(s.c_str());
-                       })) {
-                continue;
-            }
-            if (do_print) {
-                std::clog << "Running test case\n" << test_case << std::endl;
-            }
-            // annotate the test case
-            auto test_case_range =
-                annotate(system_generator.describe_config(test_case));
-
-            auto data = system_generator.generate_matrix_data(test_case);
-
-            auto nrhs = FLAGS_nrhs;
-            auto b = system_generator.create_multi_vector_random(
-                exec, gko::dim<2>{data.size[1], nrhs});
-            auto x = system_generator.create_multi_vector_random(
-                exec, gko::dim<2>{data.size[0], nrhs});
-            if (do_print) {
-                std::clog << "Matrix is of size (" << data.size[0] << ", "
-                          << data.size[1] << ")" << std::endl;
-            }
-            add_or_set_member(test_case, "size", data.size[0], allocator);
-            add_or_set_member(test_case, "nnz", data.nonzeros.size(),
-                              allocator);
-            auto best_performance = std::numeric_limits<double>::max();
-            if (!test_case.HasMember("optimal")) {
-                test_case.AddMember("optimal",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
 
-            // Compute the result from ginkgo::coo as the correct answer
-            auto answer = gko::clone(x);
-            if (FLAGS_detailed) {
-                auto system_matrix =
-                    system_generator.generate_matrix_with_default_format(exec,
-                                                                         data);
-                exec->synchronize();
-                system_matrix->apply(b, answer);
-                exec->synchronize();
+    void postprocess(json& test_case) const override
+    {
+        if (!test_case.contains("optimal")) {
+            test_case["optimal"] = json::object();
+        }
+        auto best_time = std::numeric_limits<double>::max();
+        std::string best_format;
+        // find the fastest among all formats we tested
+        for (const auto& format : formats) {
+            if (!test_case[name].contains(format)) {
+                continue;
             }
-            for (const auto& format_name : formats) {
-                {
-                    auto format_range = annotate(format_name.c_str());
-                    apply_spmv(format_name.c_str(), exec, system_generator,
-                               timer, data, b.get(), x.get(), answer.get(),
-                               test_case, allocator);
-                }
-                if (do_print) {
-                    std::clog << "Current state:" << std::endl
-                              << test_cases << std::endl;
-                }
-                if (spmv_case[format_name.c_str()]["completed"].GetBool()) {
-                    auto performance =
-                        spmv_case[format_name.c_str()]["time"].GetDouble();
-                    if (performance < best_performance) {
-                        best_performance = performance;
-                        add_or_set_member(
-                            test_case["optimal"], "spmv",
-                            rapidjson::Value(format_name.c_str(), allocator)
-                                .Move(),
-                            allocator);
-                    }
-                }
-                if (do_print) {
-                    backup_results(test_cases);
+            auto& format_case = test_case[name][format];
+            if (format_case.contains("completed") &&
+                format_case["completed"].template get<bool>()) {
+                auto time = format_case["time"];
+                if (time < best_time) {
+                    best_time = time;
+                    best_format = format;
                 }
             }
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up matrix data, what(): " << e.what()
-                      << std::endl;
-            if (FLAGS_keep_errors) {
-                rapidjson::Value msg_value;
-                msg_value.SetString(e.what(), allocator);
-                add_or_set_member(test_case, "error", msg_value, allocator);
-            }
+        }
+        if (!best_format.empty()) {
+            test_case["optimal"][name] = best_format;
         }
     }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
-}
+};
+
 
 #endif  // GINKGO_BENCHMARK_SPMV_SPMV_COMMON_HPP
diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr
index abc496b0921..b64f4321287 100644
--- a/benchmark/test/reference/blas.profile.stderr
+++ b/benchmark/test/reference/blas.profile.stderr
@@ -10,6 +10,7 @@ Running test case
     "blas": {}
 }
 DEBUG: begin n = 100 
+	Running blas: copy
 DEBUG: begin copy
 DEBUG: begin allocate
 DEBUG: end   allocate
@@ -24,21 +25,7 @@ DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: end   copy
-Current state:
-[
-    {
-        "n": 100,
-        "blas": {
-            "copy": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 1,
-                "completed": true
-            }
-        }
-    }
-]
+	Running blas: axpy
 DEBUG: begin axpy
 DEBUG: begin allocate
 DEBUG: end   allocate
@@ -61,28 +48,7 @@ DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: end   axpy
-Current state:
-[
-    {
-        "n": 100,
-        "blas": {
-            "copy": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 1,
-                "completed": true
-            },
-            "axpy": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 1,
-                "completed": true
-            }
-        }
-    }
-]
+	Running blas: scal
 DEBUG: begin scal
 DEBUG: begin allocate
 DEBUG: end   allocate
@@ -99,33 +65,4 @@ DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: end   scal
-Current state:
-[
-    {
-        "n": 100,
-        "blas": {
-            "copy": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 1,
-                "completed": true
-            },
-            "axpy": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 1,
-                "completed": true
-            },
-            "scal": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 1,
-                "completed": true
-            }
-        }
-    }
-]
 DEBUG: end   n = 100 
diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr
index 9508b0dcf1e..f41b25c6ee1 100644
--- a/benchmark/test/reference/blas.simple.stderr
+++ b/benchmark/test/reference/blas.simple.stderr
@@ -9,69 +9,6 @@ Running test case
     "n": 100,
     "blas": {}
 }
-Current state:
-[
-    {
-        "n": 100,
-        "blas": {
-            "copy": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
-Current state:
-[
-    {
-        "n": 100,
-        "blas": {
-            "copy": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "axpy": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
-Current state:
-[
-    {
-        "n": 100,
-        "blas": {
-            "copy": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "axpy": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "scal": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
+	Running blas: copy
+	Running blas: axpy
+	Running blas: scal
diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr
index 9ab8a899649..1d5df7477ba 100644
--- a/benchmark/test/reference/conversion.all.stderr
+++ b/benchmark/test/reference/conversion.all.stderr
@@ -4,1853 +4,23 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr,ell,sellp,hybrid
-Benchmarking conversions. 
 Running test case
 {
     "size": 100,
     "stencil": "7pt",
-    "conversions": {}
-}
-Matrix is of size (125, 125)
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-sellp": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-hybrid": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "csr-coo": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-ell": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-sellp": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-hybrid": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-coo": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "csr-coo": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-ell": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-sellp": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-hybrid": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-sellp": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-sellp": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "csr-coo": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-ell": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-sellp": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-hybrid": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-hybrid": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-hybrid": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "csr-coo": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-ell": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-sellp": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-hybrid": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-coo": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-coo": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "csr-coo": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-ell": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-sellp": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-hybrid": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "sellp-ell": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "sellp-ell": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "csr-coo": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-ell": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-sellp": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-hybrid": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "sellp-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-hybrid": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "sellp-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-hybrid": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "csr-coo": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-ell": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-sellp": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-hybrid": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "sellp-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "hybrid-coo": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "sellp-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "hybrid-coo": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "sellp-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "hybrid-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "hybrid-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "csr-coo": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-ell": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-sellp": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-hybrid": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "sellp-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "hybrid-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "hybrid-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "hybrid-ell": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "sellp-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "hybrid-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "hybrid-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "hybrid-ell": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
-Error when processing test case
-{
-    "size": 125,
-    "stencil": "7pt",
-    "conversions": {
-        "coo-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "coo-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "coo-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "csr-coo": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-ell": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-sellp": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "csr-hybrid": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "ell-sellp": {
-            "completed": false,
-            "error": ""
-        },
-        "ell-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "sellp-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "sellp-hybrid": {
-            "completed": false,
-            "error": ""
-        },
-        "hybrid-coo": {
-            "completed": false,
-            "error": ""
-        },
-        "hybrid-csr": {
-            "time": 1.0,
-            "repetitions": 10,
-            "completed": true
-        },
-        "hybrid-ell": {
-            "completed": false,
-            "error": ""
-        },
-        "hybrid-sellp": {
-            "completed": false,
-            "error": ""
-        }
-    }
-}
-what(): <removed>
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-ell": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-sellp": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-hybrid": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "ell-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "sellp-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "hybrid-coo": {
-                "completed": false,
-                "error": ""
-            },
-            "hybrid-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "hybrid-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "hybrid-sellp": {
-                "completed": false,
-                "error": ""
-            }
-        }
-    }
-]
+    "conversion": {}
+}
+Matrix is of size (125, 125), 725
+	Running conversion: coo-read
+	Running conversion: coo-csr
+	Running conversion: csr-read
+	Running conversion: csr-coo
+	Running conversion: csr-ell
+	Running conversion: csr-sellp
+	Running conversion: csr-hybrid
+	Running conversion: ell-read
+	Running conversion: ell-csr
+	Running conversion: sellp-read
+	Running conversion: sellp-csr
+	Running conversion: hybrid-read
+	Running conversion: hybrid-csr
diff --git a/benchmark/test/reference/conversion.all.stdout b/benchmark/test/reference/conversion.all.stdout
index cb53bb81a6c..c4b657a42c4 100644
--- a/benchmark/test/reference/conversion.all.stdout
+++ b/benchmark/test/reference/conversion.all.stdout
@@ -1,25 +1,23 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
+        "conversion": {
+            "coo-read": {
                 "time": 1.0,
                 "repetitions": 10,
                 "completed": true
             },
-            "coo-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "coo-sellp": {
-                "completed": false,
-                "error": ""
+            "coo-csr": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
             },
-            "coo-hybrid": {
-                "completed": false,
-                "error": ""
+            "csr-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
             },
             "csr-coo": {
                 "time": 1.0,
@@ -41,57 +39,39 @@
                 "repetitions": 10,
                 "completed": true
             },
-            "ell-coo": {
-                "completed": false,
-                "error": ""
+            "ell-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
             },
             "ell-csr": {
                 "time": 1.0,
                 "repetitions": 10,
                 "completed": true
             },
-            "ell-sellp": {
-                "completed": false,
-                "error": ""
-            },
-            "ell-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-coo": {
-                "completed": false,
-                "error": ""
+            "sellp-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
             },
             "sellp-csr": {
                 "time": 1.0,
                 "repetitions": 10,
                 "completed": true
             },
-            "sellp-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "sellp-hybrid": {
-                "completed": false,
-                "error": ""
-            },
-            "hybrid-coo": {
-                "completed": false,
-                "error": ""
+            "hybrid-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
             },
             "hybrid-csr": {
                 "time": 1.0,
                 "repetitions": 10,
                 "completed": true
-            },
-            "hybrid-ell": {
-                "completed": false,
-                "error": ""
-            },
-            "hybrid-sellp": {
-                "completed": false,
-                "error": ""
             }
-        }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
     }
 ]
diff --git a/benchmark/test/reference/conversion.matrix.stderr b/benchmark/test/reference/conversion.matrix.stderr
index 1d604175479..369a363a53e 100644
--- a/benchmark/test/reference/conversion.matrix.stderr
+++ b/benchmark/test/reference/conversion.matrix.stderr
@@ -4,43 +4,13 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr
-Benchmarking conversions. 
 Running test case
 {
     "filename": "",
-    "conversions": {}
+    "conversion": {}
 }
-Matrix is of size (36, 36)
-Current state:
-[
-    {
-        "filename": "",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        },
-        "size": 36
-    }
-]
-Current state:
-[
-    {
-        "filename": "",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        },
-        "size": 36
-    }
-]
+Matrix is of size (36, 36), 208
+	Running conversion: coo-read
+	Running conversion: coo-csr
+	Running conversion: csr-read
+	Running conversion: csr-coo
diff --git a/benchmark/test/reference/conversion.matrix.stdout b/benchmark/test/reference/conversion.matrix.stdout
index e43edda0595..7e537fa4919 100644
--- a/benchmark/test/reference/conversion.matrix.stdout
+++ b/benchmark/test/reference/conversion.matrix.stdout
@@ -2,18 +2,30 @@
 [
     {
         "filename": "",
-        "conversions": {
+        "conversion": {
+            "coo-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
             "coo-csr": {
                 "time": 1.0,
                 "repetitions": 10,
                 "completed": true
             },
+            "csr-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
             "csr-coo": {
                 "time": 1.0,
                 "repetitions": 10,
                 "completed": true
             }
         },
-        "size": 36
+        "rows": 36,
+        "cols": 36,
+        "nonzeros": 208
     }
 ]
diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr
index 6733472be8f..089e6be02f9 100644
--- a/benchmark/test/reference/conversion.profile.stderr
+++ b/benchmark/test/reference/conversion.profile.stderr
@@ -4,15 +4,16 @@ Running on reference(0)
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr
-Benchmarking conversions. 
 Running test case
 {
     "size": 100,
     "stencil": "7pt",
-    "conversions": {}
+    "conversion": {}
 }
-Matrix is of size (125, 125)
-DEBUG: begin stencil(125,7pt)
+Matrix is of size (125, 125), 725
+DEBUG: begin stencil(100,7pt)
+	Running conversion: coo-read
+DEBUG: begin coo-read
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin allocate
@@ -21,13 +22,17 @@ DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: end   coo-read
+	Running conversion: coo-csr
 DEBUG: begin coo-csr
 DEBUG: begin allocate
 DEBUG: end   allocate
-DEBUG: begin components::fill_array
-DEBUG: end   components::fill_array
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin allocate
@@ -36,12 +41,8 @@ DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
 DEBUG: begin allocate
 DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin components::convert_idxs_to_ptrs
-DEBUG: end   components::convert_idxs_to_ptrs
-DEBUG: begin free
-DEBUG: end   free
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
 DEBUG: begin copy(<typename>)
 DEBUG: begin allocate
 DEBUG: end   allocate
@@ -49,14 +50,10 @@ DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin allocate
 DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin components::convert_idxs_to_ptrs
@@ -68,27 +65,15 @@ DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
-DEBUG: end   coo-csr
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 1,
-                "completed": true
-            }
-        }
-    }
-]
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
+DEBUG: end   coo-csr
+	Running conversion: csr-read
+DEBUG: begin csr-read
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin components::fill_array
@@ -109,32 +94,46 @@ DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
 DEBUG: begin free
 DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: end   csr-read
+	Running conversion: csr-coo
 DEBUG: begin csr-coo
 DEBUG: begin allocate
 DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin allocate
+DEBUG: end   allocate
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin copy(<typename>)
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin free
 DEBUG: end   free
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin copy(<typename>)
+DEBUG: begin allocate
+DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin components::convert_ptrs_to_idxs
 DEBUG: end   components::convert_ptrs_to_idxs
 DEBUG: end   copy(<typename>)
@@ -144,30 +143,11 @@ DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
-DEBUG: end   csr-coo
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 1,
-                "completed": true
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 1,
-                "completed": true
-            }
-        }
-    }
-]
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
-DEBUG: end   stencil(125,7pt)
+DEBUG: end   csr-coo
+DEBUG: end   stencil(100,7pt)
diff --git a/benchmark/test/reference/conversion.profile.stdout b/benchmark/test/reference/conversion.profile.stdout
index 3e76bc26934..b29815f6c17 100644
--- a/benchmark/test/reference/conversion.profile.stdout
+++ b/benchmark/test/reference/conversion.profile.stdout
@@ -1,19 +1,32 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
-        "conversions": {
+        "conversion": {
+            "coo-read": {
+                "time": 1.0,
+                "repetitions": 1,
+                "completed": true
+            },
             "coo-csr": {
                 "time": 1.0,
                 "repetitions": 1,
                 "completed": true
             },
+            "csr-read": {
+                "time": 1.0,
+                "repetitions": 1,
+                "completed": true
+            },
             "csr-coo": {
                 "time": 1.0,
                 "repetitions": 1,
                 "completed": true
             }
-        }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
     }
 ]
diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr
index d221ead12a4..a814dba6888 100644
--- a/benchmark/test/reference/conversion.simple.stderr
+++ b/benchmark/test/reference/conversion.simple.stderr
@@ -4,44 +4,14 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr
-Benchmarking conversions. 
 Running test case
 {
     "size": 100,
     "stencil": "7pt",
-    "conversions": {}
+    "conversion": {}
 }
-Matrix is of size (125, 125)
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "conversions": {
-            "coo-csr": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            },
-            "csr-coo": {
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        }
-    }
-]
+Matrix is of size (125, 125), 725
+	Running conversion: coo-read
+	Running conversion: coo-csr
+	Running conversion: csr-read
+	Running conversion: csr-coo
diff --git a/benchmark/test/reference/conversion.simple.stdout b/benchmark/test/reference/conversion.simple.stdout
index 9ecdd46f5e1..856f1330eea 100644
--- a/benchmark/test/reference/conversion.simple.stdout
+++ b/benchmark/test/reference/conversion.simple.stdout
@@ -1,19 +1,32 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
-        "conversions": {
+        "conversion": {
+            "coo-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
             "coo-csr": {
                 "time": 1.0,
                 "repetitions": 10,
                 "completed": true
             },
+            "csr-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
             "csr-coo": {
                 "time": 1.0,
                 "repetitions": 10,
                 "completed": true
             }
-        }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
     }
 ]
diff --git a/benchmark/test/reference/distributed_solver.matrix.stdout b/benchmark/test/reference/distributed_solver.matrix.stdout
index 34fdda13e55..cd3c7b8bd43 100644
--- a/benchmark/test/reference/distributed_solver.matrix.stdout
+++ b/benchmark/test/reference/distributed_solver.matrix.stdout
@@ -52,6 +52,7 @@
                 "completed": true
             }
         },
-        "size": 36
+        "rows": 36,
+        "cols": 36
     }
 ]
diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr
index efd79f66dc5..e583a1411a8 100644
--- a/benchmark/test/reference/distributed_solver.profile.stderr
+++ b/benchmark/test/reference/distributed_solver.profile.stderr
@@ -5,7 +5,6 @@ Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-DEBUG: begin stencil(100,7pt,stencil)
 Running test case
 {
     "size": 100,
@@ -213,9 +212,9 @@ DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
 Matrix is of size (125, 125)
-DEBUG: begin cg
+DEBUG: begin stencil(100,7pt,stencil)
 	Running solver: cg
-DEBUG: begin none
+DEBUG: begin cg
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin dense::compute_squared_norm2
@@ -670,8 +669,8 @@ DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
-DEBUG: end   none
 DEBUG: end   cg
+DEBUG: end   stencil(100,7pt,stencil)
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
@@ -686,4 +685,3 @@ DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
-DEBUG: end   stencil(100,7pt,stencil)
diff --git a/benchmark/test/reference/distributed_solver.profile.stdout b/benchmark/test/reference/distributed_solver.profile.stdout
index c61541a5d5b..aef92652256 100644
--- a/benchmark/test/reference/distributed_solver.profile.stdout
+++ b/benchmark/test/reference/distributed_solver.profile.stdout
@@ -1,7 +1,7 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
         "comm_pattern": "stencil",
         "optimal": {
@@ -27,6 +27,8 @@
                 "repetitions": 1,
                 "completed": true
             }
-        }
+        },
+        "rows": 125,
+        "cols": 125
     }
 ]
diff --git a/benchmark/test/reference/distributed_solver.simple.stdout b/benchmark/test/reference/distributed_solver.simple.stdout
index 54d7233ba77..002b9d91347 100644
--- a/benchmark/test/reference/distributed_solver.simple.stdout
+++ b/benchmark/test/reference/distributed_solver.simple.stdout
@@ -1,7 +1,7 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
         "comm_pattern": "stencil",
         "optimal": {
@@ -53,6 +53,8 @@
                 "repetitions": 1,
                 "completed": true
             }
-        }
+        },
+        "rows": 125,
+        "cols": 125
     }
 ]
diff --git a/benchmark/test/reference/matrix_statistics.matrix.stderr b/benchmark/test/reference/matrix_statistics.matrix.stderr
index af205c778c0..7bb33842f25 100644
--- a/benchmark/test/reference/matrix_statistics.matrix.stderr
+++ b/benchmark/test/reference/matrix_statistics.matrix.stderr
@@ -5,4 +5,4 @@ Running test case
     "filename": "",
     "problem": {}
 }
-Matrix is of size (36, 36)
+Matrix is of size (36, 36), 208
diff --git a/benchmark/test/reference/matrix_statistics.matrix.stdout b/benchmark/test/reference/matrix_statistics.matrix.stdout
index a056241669b..ea73587fde4 100644
--- a/benchmark/test/reference/matrix_statistics.matrix.stdout
+++ b/benchmark/test/reference/matrix_statistics.matrix.stdout
@@ -33,6 +33,8 @@
                 "hyperflatness": 6.0545648993883665
             }
         },
-        "size": 36
+        "rows": 36,
+        "cols": 36,
+        "nonzeros": 208
     }
 ]
diff --git a/benchmark/test/reference/matrix_statistics.simple.stderr b/benchmark/test/reference/matrix_statistics.simple.stderr
index 6b853c3f4ea..75a7cca709f 100644
--- a/benchmark/test/reference/matrix_statistics.simple.stderr
+++ b/benchmark/test/reference/matrix_statistics.simple.stderr
@@ -6,4 +6,4 @@ Running test case
     "stencil": "7pt",
     "problem": {}
 }
-Matrix is of size (125, 125)
+Matrix is of size (125, 125), 725
diff --git a/benchmark/test/reference/matrix_statistics.simple.stdout b/benchmark/test/reference/matrix_statistics.simple.stdout
index 4470784e7c5..13746ce8a46 100644
--- a/benchmark/test/reference/matrix_statistics.simple.stdout
+++ b/benchmark/test/reference/matrix_statistics.simple.stdout
@@ -1,7 +1,7 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
         "problem": {
             "rows": 125,
@@ -33,6 +33,9 @@
                 "hyperskewness": -1.741577812922432,
                 "hyperflatness": 7.762345679012379
             }
-        }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
     }
 ]
diff --git a/benchmark/test/reference/preconditioner.matrix.stderr b/benchmark/test/reference/preconditioner.matrix.stderr
index c9ef583d79e..4088a20c925 100644
--- a/benchmark/test/reference/preconditioner.matrix.stderr
+++ b/benchmark/test/reference/preconditioner.matrix.stderr
@@ -9,34 +9,5 @@ Running test case
     "filename": "",
     "preconditioner": {}
 }
-Matrix is of size (36, 36)
-Current state:
-[
-    {
-        "filename": "",
-        "preconditioner": {
-            "none": {
-                "generate": {
-                    "components": {
-                        "generate(<typename>)": 1.0,
-                        "overhead": 1.0
-                    },
-                    "time": 1.0,
-                    "repetitions": 10
-                },
-                "apply": {
-                    "components": {
-                        "apply(<typename>)": 1.0,
-                        "copy(<typename>)": 1.0,
-                        "dense::copy": 1.0,
-                        "overhead": 1.0
-                    },
-                    "time": 1.0,
-                    "repetitions": 10
-                },
-                "completed": true
-            }
-        },
-        "size": 36
-    }
-]
+Matrix is of size (36, 36), 208
+	Running preconditioner: none
diff --git a/benchmark/test/reference/preconditioner.matrix.stdout b/benchmark/test/reference/preconditioner.matrix.stdout
index 77979f4c54b..0415a87ea8d 100644
--- a/benchmark/test/reference/preconditioner.matrix.stdout
+++ b/benchmark/test/reference/preconditioner.matrix.stdout
@@ -25,6 +25,8 @@
                 "completed": true
             }
         },
-        "size": 36
+        "rows": 36,
+        "cols": 36,
+        "nonzeros": 208
     }
 ]
diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr
index 5b47bc9bd94..c215b22c925 100644
--- a/benchmark/test/reference/preconditioner.profile.stderr
+++ b/benchmark/test/reference/preconditioner.profile.stderr
@@ -10,7 +10,6 @@ Running test case
     "stencil": "7pt",
     "preconditioner": {}
 }
-DEBUG: begin stencil(100,7pt)
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin components::fill_array
@@ -59,7 +58,9 @@ DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
 DEBUG: end   dense::fill_in_matrix_data
-Matrix is of size (125, 125)
+Matrix is of size (125, 125), 725
+DEBUG: begin stencil(100,7pt)
+	Running preconditioner: none
 DEBUG: begin none
 DEBUG: begin copy(<typename>)
 DEBUG: begin allocate
@@ -78,28 +79,7 @@ DEBUG: end   apply(<typename>)
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: end   none
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "preconditioner": {
-            "none": {
-                "generate": {
-                    "components": {},
-                    "time": 1.0,
-                    "repetitions": 1
-                },
-                "apply": {
-                    "components": {},
-                    "time": 1.0,
-                    "repetitions": 1
-                },
-                "completed": true
-            }
-        }
-    }
-]
+DEBUG: end   stencil(100,7pt)
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
@@ -110,4 +90,3 @@ DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
-DEBUG: end   stencil(100,7pt)
diff --git a/benchmark/test/reference/preconditioner.profile.stdout b/benchmark/test/reference/preconditioner.profile.stdout
index cc73c4c4552..f53407d818d 100644
--- a/benchmark/test/reference/preconditioner.profile.stdout
+++ b/benchmark/test/reference/preconditioner.profile.stdout
@@ -1,7 +1,7 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
         "preconditioner": {
             "none": {
@@ -17,6 +17,9 @@
                 },
                 "completed": true
             }
-        }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
     }
 ]
diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr
index d480d4fedbd..07d2cca6704 100644
--- a/benchmark/test/reference/preconditioner.simple.stderr
+++ b/benchmark/test/reference/preconditioner.simple.stderr
@@ -10,34 +10,5 @@ Running test case
     "stencil": "7pt",
     "preconditioner": {}
 }
-Matrix is of size (125, 125)
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "preconditioner": {
-            "none": {
-                "generate": {
-                    "components": {
-                        "generate(<typename>)": 1.0,
-                        "overhead": 1.0
-                    },
-                    "time": 1.0,
-                    "repetitions": 10
-                },
-                "apply": {
-                    "components": {
-                        "apply(<typename>)": 1.0,
-                        "copy(<typename>)": 1.0,
-                        "dense::copy": 1.0,
-                        "overhead": 1.0
-                    },
-                    "time": 1.0,
-                    "repetitions": 10
-                },
-                "completed": true
-            }
-        }
-    }
-]
+Matrix is of size (125, 125), 725
+	Running preconditioner: none
diff --git a/benchmark/test/reference/preconditioner.simple.stdout b/benchmark/test/reference/preconditioner.simple.stdout
index c47146a72e1..92bb51ddb57 100644
--- a/benchmark/test/reference/preconditioner.simple.stdout
+++ b/benchmark/test/reference/preconditioner.simple.stdout
@@ -1,7 +1,7 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
         "preconditioner": {
             "none": {
@@ -25,6 +25,9 @@
                 },
                 "completed": true
             }
-        }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
     }
 ]
diff --git a/benchmark/test/reference/solver.matrix.stdout b/benchmark/test/reference/solver.matrix.stdout
index 6a1f8ceb959..56577288c2d 100644
--- a/benchmark/test/reference/solver.matrix.stdout
+++ b/benchmark/test/reference/solver.matrix.stdout
@@ -50,6 +50,7 @@
                 "completed": true
             }
         },
-        "size": 36
+        "rows": 36,
+        "cols": 36
     }
 ]
diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr
index 65b7560d936..0c3f7060796 100644
--- a/benchmark/test/reference/solver.profile.stderr
+++ b/benchmark/test/reference/solver.profile.stderr
@@ -5,7 +5,6 @@ Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-DEBUG: begin stencil(100,7pt)
 Running test case
 {
     "size": 100,
@@ -62,9 +61,9 @@ DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
 Matrix is of size (125, 125)
-DEBUG: begin cg
+DEBUG: begin stencil(100,7pt)
 	Running solver: cg
-DEBUG: begin none
+DEBUG: begin cg
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin dense::compute_norm2_dispatch
@@ -425,8 +424,8 @@ DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
-DEBUG: end   none
 DEBUG: end   cg
+DEBUG: end   stencil(100,7pt)
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
@@ -437,4 +436,3 @@ DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
-DEBUG: end   stencil(100,7pt)
diff --git a/benchmark/test/reference/solver.profile.stdout b/benchmark/test/reference/solver.profile.stdout
index 128a8a1f169..0148e6ef092 100644
--- a/benchmark/test/reference/solver.profile.stdout
+++ b/benchmark/test/reference/solver.profile.stdout
@@ -1,7 +1,7 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
         "optimal": {
             "spmv": "csr"
@@ -26,6 +26,8 @@
                 "repetitions": 1,
                 "completed": true
             }
-        }
+        },
+        "rows": 125,
+        "cols": 125
     }
 ]
diff --git a/benchmark/test/reference/solver.simple.stdout b/benchmark/test/reference/solver.simple.stdout
index c6055339d67..b4e7b56b2bf 100644
--- a/benchmark/test/reference/solver.simple.stdout
+++ b/benchmark/test/reference/solver.simple.stdout
@@ -1,7 +1,7 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
         "optimal": {
             "spmv": "csr"
@@ -50,6 +50,8 @@
                 "repetitions": 1,
                 "completed": true
             }
-        }
+        },
+        "rows": 125,
+        "cols": 125
     }
 ]
diff --git a/benchmark/test/reference/sparse_blas.matrix.stderr b/benchmark/test/reference/sparse_blas.matrix.stderr
index 5001c604e72..ff52b6a3269 100644
--- a/benchmark/test/reference/sparse_blas.matrix.stderr
+++ b/benchmark/test/reference/sparse_blas.matrix.stderr
@@ -3,34 +3,11 @@ This is Ginkgo 1.7.0 (develop)
 Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
-The operations are transposeRunning test case
+The operations are transpose
+Running test case
 {
     "filename": "",
     "sparse_blas": {}
 }
 Matrix is of size (36, 36), 208
-Current state:
-[
-    {
-        "filename": "",
-        "sparse_blas": {
-            "transpose": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 10,
-                "components": {
-                    "allocate": 1.0,
-                    "components::fill_array": 1.0,
-                    "csr::transpose": 1.0,
-                    "free": 1.0,
-                    "overhead": 1.0
-                },
-                "completed": true
-            }
-        },
-        "rows": 36,
-        "cols": 36,
-        "nonzeros": 208
-    }
-]
+	Running sparse_blas: transpose
diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr
index d05f5117b8e..d1434dad146 100644
--- a/benchmark/test/reference/sparse_blas.profile.stderr
+++ b/benchmark/test/reference/sparse_blas.profile.stderr
@@ -3,7 +3,8 @@ This is Ginkgo 1.7.0 (develop)
 Running on reference(0)
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
-The operations are transposeRunning test case
+The operations are transpose
+Running test case
 {
     "size": 100,
     "stencil": "7pt",
@@ -35,6 +36,7 @@ DEBUG: end   components::convert_idxs_to_ptrs
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin stencil(100,7pt)
+	Running sparse_blas: transpose
 DEBUG: begin transpose
 DEBUG: begin allocate
 DEBUG: end   allocate
@@ -53,25 +55,6 @@ DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: end   transpose
-Current state:
-[
-    {
-        "size": 100,
-        "stencil": "7pt",
-        "sparse_blas": {
-            "transpose": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 1,
-                "completed": true
-            }
-        },
-        "rows": 125,
-        "cols": 125,
-        "nonzeros": 725
-    }
-]
 DEBUG: end   stencil(100,7pt)
 DEBUG: begin free
 DEBUG: end   free
diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr
index bf5001f67b7..452374a9268 100644
--- a/benchmark/test/reference/sparse_blas.simple.stderr
+++ b/benchmark/test/reference/sparse_blas.simple.stderr
@@ -3,36 +3,12 @@ This is Ginkgo 1.7.0 (develop)
 Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
-The operations are transposeRunning test case
+The operations are transpose
+Running test case
 {
     "size": 100,
     "stencil": "7pt",
     "sparse_blas": {}
 }
 Matrix is of size (125, 125), 725
-Current state:
-[
-    {
-        "size": 100,
-        "stencil": "7pt",
-        "sparse_blas": {
-            "transpose": {
-                "time": 1.0,
-                "flops": 1.0,
-                "bandwidth": 1.0,
-                "repetitions": 10,
-                "components": {
-                    "allocate": 1.0,
-                    "components::fill_array": 1.0,
-                    "csr::transpose": 1.0,
-                    "free": 1.0,
-                    "overhead": 1.0
-                },
-                "completed": true
-            }
-        },
-        "rows": 125,
-        "cols": 125,
-        "nonzeros": 725
-    }
-]
+	Running sparse_blas: transpose
diff --git a/benchmark/test/reference/spmv.matrix.stderr b/benchmark/test/reference/spmv.matrix.stderr
index 8d942cd0de5..a618da5b321 100644
--- a/benchmark/test/reference/spmv.matrix.stderr
+++ b/benchmark/test/reference/spmv.matrix.stderr
@@ -10,22 +10,5 @@ Running test case
     "filename": "",
     "spmv": {}
 }
-Matrix is of size (36, 36)
-Current state:
-[
-    {
-        "filename": "",
-        "spmv": {
-            "coo": {
-                "storage": 3328,
-                "max_relative_norm2": 1.0,
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        },
-        "size": 36,
-        "nnz": 208,
-        "optimal": {}
-    }
-]
+Matrix is of size (36, 36), 208
+	Running spmv: coo
diff --git a/benchmark/test/reference/spmv.matrix.stdout b/benchmark/test/reference/spmv.matrix.stdout
index 47035c27549..dc30ab6b284 100644
--- a/benchmark/test/reference/spmv.matrix.stdout
+++ b/benchmark/test/reference/spmv.matrix.stdout
@@ -11,8 +11,9 @@
                 "completed": true
             }
         },
-        "size": 36,
-        "nnz": 208,
+        "rows": 36,
+        "cols": 36,
+        "nonzeros": 208,
         "optimal": {
             "spmv": "coo"
         }
diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr
index 961ac587990..09a10b725ea 100644
--- a/benchmark/test/reference/spmv.profile.stderr
+++ b/benchmark/test/reference/spmv.profile.stderr
@@ -11,7 +11,6 @@ Running test case
     "stencil": "7pt",
     "spmv": {}
 }
-DEBUG: begin stencil(100,7pt)
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin allocate
@@ -52,13 +51,9 @@ DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
-Matrix is of size (125, 125)
-DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin dense::copy
-DEBUG: end   dense::copy
-DEBUG: end   copy(<typename>)
+Matrix is of size (125, 125), 725
+DEBUG: begin stencil(100,7pt)
+	Running spmv: coo
 DEBUG: begin coo
 DEBUG: begin allocate
 DEBUG: end   allocate
@@ -87,27 +82,8 @@ DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: end   coo
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "spmv": {
-            "coo": {
-                "storage": 11600,
-                "time": 1.0,
-                "repetitions": 1,
-                "completed": true
-            }
-        },
-        "nnz": 725,
-        "optimal": {}
-    }
-]
-DEBUG: begin free
-DEBUG: end   free
+DEBUG: end   stencil(100,7pt)
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
 DEBUG: end   free
-DEBUG: end   stencil(100,7pt)
diff --git a/benchmark/test/reference/spmv.profile.stdout b/benchmark/test/reference/spmv.profile.stdout
index dacc490ddf0..5302d54f9f0 100644
--- a/benchmark/test/reference/spmv.profile.stdout
+++ b/benchmark/test/reference/spmv.profile.stdout
@@ -1,7 +1,7 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
         "spmv": {
             "coo": {
@@ -11,7 +11,9 @@
                 "completed": true
             }
         },
-        "nnz": 725,
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725,
         "optimal": {
             "spmv": "coo"
         }
diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr
index dc9933b40ec..a910512ff31 100644
--- a/benchmark/test/reference/spmv.simple.stderr
+++ b/benchmark/test/reference/spmv.simple.stderr
@@ -11,22 +11,5 @@ Running test case
     "stencil": "7pt",
     "spmv": {}
 }
-Matrix is of size (125, 125)
-Current state:
-[
-    {
-        "size": 125,
-        "stencil": "7pt",
-        "spmv": {
-            "coo": {
-                "storage": 11600,
-                "max_relative_norm2": 1.0,
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        },
-        "nnz": 725,
-        "optimal": {}
-    }
-]
+Matrix is of size (125, 125), 725
+	Running spmv: coo
diff --git a/benchmark/test/reference/spmv.simple.stdout b/benchmark/test/reference/spmv.simple.stdout
index 90f8903a452..737938d7c96 100644
--- a/benchmark/test/reference/spmv.simple.stdout
+++ b/benchmark/test/reference/spmv.simple.stdout
@@ -1,7 +1,7 @@
 
 [
     {
-        "size": 125,
+        "size": 100,
         "stencil": "7pt",
         "spmv": {
             "coo": {
@@ -12,7 +12,9 @@
                 "completed": true
             }
         },
-        "nnz": 725,
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725,
         "optimal": {
             "spmv": "coo"
         }
diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index b7ec0e72cf1..41acb560ba1 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <array>
 #include <fstream>
 #include <functional>
+#include <iomanip>
 #include <map>
 #include <ostream>
 #include <random>
@@ -53,10 +54,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <gflags/gflags.h>
-#include <rapidjson/document.h>
-#include <rapidjson/istreamwrapper.h>
-#include <rapidjson/ostreamwrapper.h>
-#include <rapidjson/prettywriter.h>
 
 
 #include <ginkgo/core/base/memory.hpp>
@@ -100,10 +97,6 @@ DEFINE_string(
 DEFINE_bool(detailed, true,
             "If set, performs several runs to obtain more detailed results");
 
-DEFINE_bool(keep_errors, true,
-            "If set, writes exception messages during the execution into the "
-            "JSON output");
-
 DEFINE_bool(nested_names, false, "If set, separately logs nested operations");
 
 DEFINE_bool(profile, false,
@@ -157,27 +150,32 @@ std::unique_ptr<std::istream> input_stream;
  * @param format  the format of the benchmark input data
  */
 void initialize_argument_parsing(int* argc, char** argv[], std::string& header,
-                                 std::string& format)
+                                 std::string& format, bool do_print = true)
 {
-    std::ostringstream doc;
-    doc << header << "Usage: " << (*argv)[0] << " [options]\n"
-        << format
-        << "  The results are written on standard output, in the same "
-           "format,\n"
-        << "  but with test cases extended to include an additional member "
-           "\n"
-        << "  object for each benchmark run.\n"
-        << "  If run with a --backup flag, an intermediate result is "
-           "written \n"
-        << "  to a file in the same format. The backup file can be used as "
-           "\n"
-        << "  input to this test suite, and the benchmarking will \n"
-        << "  continue from the point where the backup file was created.";
-
-    gflags::SetUsageMessage(doc.str());
-    std::ostringstream ver;
-    ver << gko::version_info::get();
-    gflags::SetVersionString(ver.str());
+    if (do_print) {
+        std::ostringstream doc;
+        doc << header << "Usage: " << (*argv)[0] << " [options]\n"
+            << format
+            << "  The results are written on standard output, in the same "
+               "format,\n"
+            << "  but with test cases extended to include an additional member "
+               "\n"
+            << "  object for each benchmark run.\n"
+            << "  If run with a --backup flag, an intermediate result is "
+               "written \n"
+            << "  to a file in the same format. The backup file can be used as "
+               "\n"
+            << "  input to this test suite, and the benchmarking will \n"
+            << "  continue from the point where the backup file was created.";
+
+        gflags::SetUsageMessage(doc.str());
+        std::ostringstream ver;
+        ver << gko::version_info::get();
+        gflags::SetVersionString(ver.str());
+    } else {
+        gflags::SetUsageMessage("");
+        gflags::SetVersionString("");
+    }
     gflags::ParseCommandLineFlags(argc, argv, true);
     if (FLAGS_profile) {
         FLAGS_repetitions = "1";
@@ -206,20 +204,19 @@ void print_general_information(const std::string& extra)
 {
     std::clog << gko::version_info::get() << std::endl
               << "Running on " << FLAGS_executor << "(" << FLAGS_device_id
-              << ")" << std::endl
+              << ")\n"
               << "Running with " << FLAGS_warmup << " warm iterations and ";
     if (FLAGS_repetitions == "auto") {
         std::clog << "adaptively determined repetititions with "
                   << FLAGS_min_repetitions
                   << " <= rep <= " << FLAGS_max_repetitions
-                  << " and a minimal runtime of " << FLAGS_min_runtime << "s"
-                  << std::endl;
+                  << " and a minimal runtime of " << FLAGS_min_runtime << "s\n";
     } else {
-        std::clog << FLAGS_repetitions << " running iterations" << std::endl;
+        std::clog << FLAGS_repetitions << " running iterations\n";
     }
     std::clog << "The random seed for right hand sides is " << FLAGS_seed
-              << std::endl
-              << extra;
+              << '\n'
+              << extra << '\n';
 }
 
 
@@ -319,7 +316,7 @@ std::istream& get_input_stream()
 
 
 // backup generation
-void backup_results(rapidjson::Document& results)
+void backup_results(json& results)
 {
     static int next = 0;
     static auto filenames = []() -> std::array<std::string, 2> {
@@ -576,279 +573,4 @@ gko::remove_complex<ValueType> compute_max_relative_norm2(
 }
 
 
-/**
- * A class for controlling the number warmup and timed iterations.
- *
- * The behavior is determined by the following flags
- * - 'repetitions' switch between fixed and adaptive number of iterations
- * - 'warmup' warmup iterations, applies in fixed and adaptive case
- * - 'min_repetitions' minimal number of repetitions (adaptive case)
- * - 'max_repetitions' maximal number of repetitions (adaptive case)
- * - 'min_runtime' minimal total runtime (adaptive case)
- * - 'repetition_growth_factor' controls the increase between two successive
- *   timings
- *
- * Usage:
- * `IterationControl` exposes the member functions:
- * - `warmup_run()`: controls run defined by `warmup` flag
- * - `run(bool)`: controls run defined by all other flags
- * - `get_timer()`: access to underlying timer
- * The first two methods return an object that is to be used in a range-based
- * for loop:
- * ```
- * IterationControl ic(get_timer(...));
- *
- * // warmup run always uses fixed number of iteration and does not issue
- * // timings
- * for(auto status: ic.warmup_run()){
- *   // execute benchmark
- * }
- * // run may use adaptive number of iterations (depending on cmd line flag)
- * // and issues timing (unless manage_timings is false)
- * for(auto status: ic.run(manage_timings [default is true])){
- *   if(! manage_timings) ic.get_timer->tic();
- *   // execute benchmark
- *   if(! manage_timings) ic.get_timer->toc();
- * }
- *
- * ```
- * At the beginning of both methods, the timer is reset.
- * The `status` object exposes the member
- * - `cur_it`, containing the current iteration number,
- * and the methods
- * - `is_finished`, checks if the benchmark is finished,
- */
-class IterationControl {
-    using IndexType = unsigned int;  //!< to be compatible with GFLAGS type
-
-    class run_control;
-
-public:
-    /**
-     * Creates an `IterationControl` object.
-     *
-     * Uses the commandline flags to setup the stopping criteria for the
-     * warmup and timed run.
-     *
-     * @param timer  the timer that is to be used for the timings
-     */
-    explicit IterationControl(const std::shared_ptr<Timer>& timer)
-    {
-        status_warmup_ = {TimerManager{timer, false}, FLAGS_warmup,
-                          FLAGS_warmup, 0., 0};
-        if (FLAGS_repetitions == "auto") {
-            status_run_ = {TimerManager{timer, true}, FLAGS_min_repetitions,
-                           FLAGS_max_repetitions, FLAGS_min_runtime};
-        } else {
-            const auto reps =
-                static_cast<unsigned int>(std::stoi(FLAGS_repetitions));
-            status_run_ = {TimerManager{timer, true}, reps, reps, 0., 0};
-        }
-    }
-
-    IterationControl() = default;
-    IterationControl(const IterationControl&) = default;
-    IterationControl(IterationControl&&) = default;
-
-    /**
-     * Creates iterable `run_control` object for the warmup run.
-     *
-     * This run uses always a fixed number of iterations.
-     */
-    run_control warmup_run()
-    {
-        status_warmup_.cur_it = 0;
-        status_warmup_.managed_timer.clear();
-        return run_control{&status_warmup_};
-    }
-
-    /**
-     * Creates iterable `run_control` object for the timed run.
-     *
-     * This run may be adaptive, depending on the commandline flags.
-     *
-     * @param manage_timings If true, the timer calls (`tic/toc`) are handled
-     * by the `run_control` object, otherwise they need to be executed outside
-     */
-    run_control run(bool manage_timings = true)
-    {
-        status_run_.cur_it = 0;
-        status_run_.managed_timer.clear();
-        status_run_.managed_timer.manage_timings = manage_timings;
-        return run_control{&status_run_};
-    }
-
-    std::shared_ptr<Timer> get_timer() const
-    {
-        return status_run_.managed_timer.timer;
-    }
-
-    /**
-     * Compute the time from the given statistical method
-     *
-     * @param method  the statistical method. If the timer does not have the
-     *                same iteration as the IterationControl, it can only use
-     *                average from the IterationControl.
-     *
-     * @return the statistical time
-     */
-    double compute_time(const std::string& method = "average") const
-    {
-        if (status_run_.managed_timer.timer->get_num_repetitions() ==
-            this->get_num_repetitions()) {
-            return status_run_.managed_timer.compute_time(method);
-        } else {
-            assert(method == "average");
-            return status_run_.managed_timer.get_total_time() /
-                   this->get_num_repetitions();
-        }
-    }
-
-    IndexType get_num_repetitions() const { return status_run_.cur_it; }
-
-private:
-    struct TimerManager {
-        std::shared_ptr<Timer> timer;
-        bool manage_timings = false;
-
-        void tic()
-        {
-            if (manage_timings) {
-                timer->tic();
-            }
-        }
-        void toc(unsigned int num = 1)
-        {
-            if (manage_timings) {
-                timer->toc(num);
-            }
-        }
-
-        void clear() { timer->clear(); }
-
-        double get_total_time() const { return timer->get_total_time(); }
-
-        double compute_time(const std::string& method = "average") const
-        {
-            return timer->compute_time(method);
-        }
-    };
-
-    /**
-     * Stores stopping criteria of the adaptive benchmark run as well as the
-     * current iteration number.
-     */
-    struct status {
-        TimerManager managed_timer{};
-
-        IndexType min_it = 0;
-        IndexType max_it = 0;
-        double max_runtime = 0.;
-
-        IndexType cur_it = 0;
-
-        /**
-         * checks if the adaptive run is complete
-         *
-         * the adaptive run is complete if:
-         * - the minimum number of iteration is reached
-         * - and either:
-         *   - the maximum number of repetitions is reached
-         *   - the total runtime is above the threshold
-         *
-         * @return completeness state of the adaptive run
-         */
-        bool is_finished() const
-        {
-            return cur_it >= min_it &&
-                   (cur_it >= max_it ||
-                    managed_timer.get_total_time() >= max_runtime);
-        }
-    };
-
-    /**
-     * Iterable class managing the benchmark iteration.
-     *
-     * Has to be used in a range-based for loop.
-     */
-    struct run_control {
-        struct iterator {
-            /**
-             * Increases the current iteration count and finishes timing if
-             * necessary.
-             *
-             * As `++it` is the last step of a for-loop, the managed_timer is
-             * stopped, if enough iterations have passed since the last timing.
-             * The interval between two timings is steadily increased to
-             * reduce the timing overhead.
-             */
-            iterator operator++()
-            {
-                cur_info->cur_it++;
-                if (cur_info->cur_it >= next_timing && !stopped) {
-                    cur_info->managed_timer.toc(
-                        static_cast<unsigned>(cur_info->cur_it - start_timing));
-                    stopped = true;
-                    next_timing = static_cast<IndexType>(std::ceil(
-                        next_timing * FLAGS_repetition_growth_factor));
-                    // If repetition_growth_factor <= 1, next_timing will be
-                    // next iteration.
-                    if (next_timing <= cur_info->cur_it) {
-                        next_timing = cur_info->cur_it + 1;
-                    }
-                }
-                return *this;
-            }
-
-            status operator*() const { return *cur_info; }
-
-            /**
-             * Checks if the benchmark is finished and handles timing, if
-             * necessary.
-             *
-             * As `begin != end` is the first step in a for-loop, the
-             * managed_timer is started, if it was previously stopped.
-             * Additionally, if the benchmark is complete and the managed_timer
-             * is still running it is stopped. (This may occur if the maximal
-             * number of repetitions is surpassed)
-             *
-             * Uses only the information from the `status` object, i.e.
-             * the right hand side is ignored.
-             *
-             * @return true if benchmark is not finished, else false
-             */
-            bool operator!=(const iterator&)
-            {
-                const bool is_finished = cur_info->is_finished();
-                if (!is_finished && stopped) {
-                    stopped = false;
-                    cur_info->managed_timer.tic();
-                    start_timing = cur_info->cur_it;
-                } else if (is_finished && !stopped) {
-                    cur_info->managed_timer.toc(
-                        static_cast<unsigned>(cur_info->cur_it - start_timing));
-                    stopped = true;
-                }
-                return !is_finished;
-            }
-
-            status* cur_info;
-            IndexType next_timing = 1;   //!< next iteration to stop timing
-            IndexType start_timing = 0;  //!< iteration for starting timing
-            bool stopped = true;
-        };
-
-        iterator begin() const { return iterator{info}; }
-
-        // not used, could potentially be used in c++17 as a sentinel
-        iterator end() const { return iterator{}; }
-
-        status* info;
-    };
-
-    status status_warmup_;
-    status status_run_;
-};
-
-
 #endif  // GKO_BENCHMARK_UTILS_GENERAL_HPP_
diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp
index 2049dadf45f..39d8b5a8107 100644
--- a/benchmark/utils/general_matrix.hpp
+++ b/benchmark/utils/general_matrix.hpp
@@ -57,9 +57,9 @@ DEFINE_string(input_matrix, "",
  */
 void initialize_argument_parsing_matrix(
     int* argc, char** argv[], std::string& header, std::string& format,
-    std::string additional_matrix_file_json = "")
+    std::string additional_matrix_file_json = "", bool do_print = true)
 {
-    initialize_argument_parsing(argc, argv, header, format);
+    initialize_argument_parsing(argc, argv, header, format, do_print);
     std::string input_matrix_str{FLAGS_input_matrix};
     if (!input_matrix_str.empty()) {
         if (input_stream) {
@@ -67,17 +67,13 @@ void initialize_argument_parsing_matrix(
                 << "-input and -input_matrix cannot be used simultaneously\n";
             std::exit(1);
         }
-        // create JSON for the filename via RapidJSON to ensure the string is
-        // correctly escaped
-        rapidjson::Document d;
+        // create JSON for the filename via nlohmann_json to ensure the string
+        // is correctly escaped
         auto json_template =
             R"([{"filename":"")" + additional_matrix_file_json + "}]";
-        d.Parse(json_template.c_str());
-        d[0]["filename"].SetString(input_matrix_str.c_str(), d.GetAllocator());
-        rapidjson::StringBuffer sb;
-        rapidjson::PrettyWriter<rapidjson::StringBuffer> writer(sb);
-        d.Accept(writer);
-        input_stream = std::make_unique<std::stringstream>(sb.GetString());
+        auto doc = json::parse(json_template);
+        doc[0]["filename"] = input_matrix_str;
+        input_stream = std::make_unique<std::stringstream>(doc.dump());
     }
 }
 
diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp
index 076d2954980..257a2384634 100644
--- a/benchmark/utils/generator.hpp
+++ b/benchmark/utils/generator.hpp
@@ -53,28 +53,45 @@ struct DefaultSystemGenerator {
     using Vec = vec<ValueType>;
 
     static gko::matrix_data<ValueType, IndexType> generate_matrix_data(
-        rapidjson::Value& config)
+        const json& config)
     {
-        if (config.HasMember("filename")) {
-            std::ifstream in(config["filename"].GetString());
+        if (config.contains("filename")) {
+            std::ifstream in(config["filename"].get<std::string>());
             return gko::read_generic_raw<ValueType, IndexType>(in);
-        } else if (config.HasMember("stencil")) {
+        } else if (config.contains("stencil")) {
             return generate_stencil<ValueType, IndexType>(
-                config["stencil"].GetString(), config["size"].GetInt64());
+                config["stencil"].get<std::string>(),
+                config["size"].get<gko::int64>());
         } else {
             throw std::runtime_error(
                 "No known way to generate matrix data found.");
         }
     }
 
-    static std::string describe_config(rapidjson::Value& config)
+    static std::string get_example_config()
     {
-        if (config.HasMember("filename")) {
-            return config["filename"].GetString();
-        } else if (config.HasMember("stencil")) {
+        return json::
+            parse(R"([{"filename": "my_file.mtx"},{"filename": "my_file2.mtx"},{"size": 100, "stencil": "7pt"}])")
+                .dump(4);
+    }
+
+    static bool validate_config(const json& test_case)
+    {  // accepted: integer "size" + string "stencil", or a string "filename"
+        const auto str = [&](const char* key) {
+            return test_case.contains(key) && test_case[key].is_string();
+        };
+        return (test_case.contains("size") && str("stencil") &&
+                test_case["size"].is_number_integer()) || str("filename");
+    }
+
+    static std::string describe_config(const json& config)
+    {
+        if (config.contains("filename")) {
+            return config["filename"].get<std::string>();
+        } else if (config.contains("stencil")) {
             std::stringstream ss;
-            ss << "stencil(" << config["size"].GetInt64() << ","
-               << config["stencil"].GetString() << ")";
+            ss << "stencil(" << config["size"].get<gko::int64>() << ","
+               << config["stencil"].get<std::string>() << ")";
             return ss.str();
         } else {
             throw std::runtime_error("No known way to describe config.");
@@ -82,30 +99,30 @@ struct DefaultSystemGenerator {
     }
 
     static std::shared_ptr<gko::LinOp> generate_matrix_with_optimal_format(
-        std::shared_ptr<gko::Executor> exec, rapidjson::Value& config)
+        std::shared_ptr<gko::Executor> exec, json& config)
     {
         auto data = generate_matrix_data(config);
         return generate_matrix_with_format(
-            std::move(exec), config["optimal"]["spmv"].GetString(), data);
+            std::move(exec), config["optimal"]["spmv"].get<std::string>(),
+            data);
     }
 
     static std::shared_ptr<gko::LinOp> generate_matrix_with_format(
         std::shared_ptr<gko::Executor> exec, const std::string& format_name,
         const gko::matrix_data<ValueType, itype>& data,
-        rapidjson::Value* spmv_case = nullptr,
-        rapidjson::MemoryPoolAllocator<>* allocator = nullptr)
+        json* spmv_case = nullptr)
     {
         auto storage_logger = std::make_shared<StorageLogger>();
-        if (spmv_case && allocator) {
+        if (spmv_case) {
             exec->add_logger(storage_logger);
         }
 
         auto mtx =
             gko::share(::formats::matrix_factory(format_name, exec, data));
 
-        if (spmv_case && allocator) {
+        if (spmv_case) {
             exec->remove_logger(storage_logger);
-            storage_logger->write_data(*spmv_case, *allocator);
+            storage_logger->write_data(*spmv_case);
         }
 
         return mtx;
@@ -172,32 +189,51 @@ struct DistributedDefaultSystemGenerator {
     using Vec = dist_vec<value_type>;
 
     gko::matrix_data<value_type, index_type> generate_matrix_data(
-        rapidjson::Value& config) const
+        const json& config) const
     {
-        if (config.HasMember("filename")) {
-            std::ifstream in(config["filename"].GetString());
+        if (config.contains("filename")) {
+            std::ifstream in(config["filename"].get<std::string>());
             return gko::read_generic_raw<value_type, index_type>(in);
-        } else if (config.HasMember("stencil")) {
+        } else if (config.contains("stencil")) {
             auto local_size = static_cast<global_itype>(
-                config["size"].GetInt64() / comm.size());
+                config["size"].get<gko::int64>() / comm.size());
             return generate_stencil<value_type, index_type>(
-                config["stencil"].GetString(), comm, local_size,
-                config["comm_pattern"].GetString() == std::string("optimal"));
+                config["stencil"].get<std::string>(), comm, local_size,
+                config["comm_pattern"].get<std::string>() ==
+                    std::string("optimal"));
         } else {
             throw std::runtime_error(
                 "No known way to generate matrix data found.");
         }
     }
 
-    std::string describe_config(rapidjson::Value& config) const
+    static std::string get_example_config()
     {
-        if (config.HasMember("filename")) {
-            return config["filename"].GetString();
-        } else if (config.HasMember("stencil")) {
+        return json::
+            parse(R"([{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}, {"filename": "my_file.mtx"}])")
+                .dump(4);
+    }
+
+    static bool validate_config(const json& test_case)
+    {  // accepted: string "filename", or "size" + "stencil" + "comm_pattern"
+        const auto str = [&](const char* key) {
+            return test_case.contains(key) && test_case[key].is_string();
+        };
+        const bool int_size = test_case.contains("size") &&
+                              test_case["size"].is_number_integer();
+        const bool stencil = int_size && str("stencil") && str("comm_pattern");
+        return stencil || str("filename");
+    }
+
+    static std::string describe_config(const json& config)
+    {
+        if (config.contains("filename")) {
+            return config["filename"].get<std::string>();
+        } else if (config.contains("stencil")) {
             std::stringstream ss;
-            ss << "stencil(" << config["size"].GetInt64() << ","
-               << config["stencil"].GetString() << ","
-               << config["comm_pattern"].GetString() << ")";
+            ss << "stencil(" << config["size"].get<gko::int64>() << ","
+               << config["stencil"].get<std::string>() << ","
+               << config["comm_pattern"].get<std::string>() << ")";
             return ss.str();
         } else {
             throw std::runtime_error("No known way to describe config.");
@@ -205,29 +241,33 @@ struct DistributedDefaultSystemGenerator {
     }
 
     std::shared_ptr<gko::LinOp> generate_matrix_with_optimal_format(
-        std::shared_ptr<gko::Executor> exec, rapidjson::Value& config) const
+        std::shared_ptr<gko::Executor> exec, json& config) const
     {
         auto data = generate_matrix_data(config);
         return generate_matrix_with_format(
-            std::move(exec), config["optimal"]["spmv"].GetString(), data);
+            std::move(exec), config["optimal"]["spmv"].get<std::string>(),
+            data);
     }
 
     std::shared_ptr<gko::LinOp> generate_matrix_with_format(
         std::shared_ptr<gko::Executor> exec, const std::string& format_name,
         const gko::matrix_data<value_type, index_type>& data,
-        rapidjson::Value* spmv_case = nullptr,
-        rapidjson::MemoryPoolAllocator<>* allocator = nullptr) const
+        json* spmv_case = nullptr) const
     {
         auto part = gko::experimental::distributed::
             Partition<itype, global_itype>::build_from_global_size_uniform(
                 exec, comm.size(), static_cast<global_itype>(data.size[0]));
         auto formats = split(format_name, '-');
+        if (formats.size() != 2) {
+            throw std::runtime_error{"Invalid distributed format specifier " +
+                                     format_name};
+        }
 
         auto local_mat = formats::matrix_type_factory.at(formats[0])(exec);
         auto non_local_mat = formats::matrix_type_factory.at(formats[1])(exec);
 
         auto storage_logger = std::make_shared<StorageLogger>();
-        if (spmv_case && allocator) {
+        if (spmv_case) {
             exec->add_logger(storage_logger);
         }
 
@@ -235,9 +275,9 @@ struct DistributedDefaultSystemGenerator {
             exec, comm, local_mat, non_local_mat);
         dist_mat->read_distributed(data, part);
 
-        if (spmv_case && allocator) {
+        if (spmv_case) {
             exec->remove_logger(storage_logger);
-            storage_logger->write_data(comm, *spmv_case, *allocator);
+            storage_logger->write_data(comm, *spmv_case);
         }
 
         return dist_mat;
diff --git a/benchmark/utils/iteration_control.hpp b/benchmark/utils/iteration_control.hpp
new file mode 100644
index 00000000000..295ae7870d6
--- /dev/null
+++ b/benchmark/utils/iteration_control.hpp
@@ -0,0 +1,326 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_
+#define GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_
+
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <memory>
+#include <string>
+#include <utility>
+
+
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/timer.hpp"
+#include "benchmark/utils/types.hpp"
+#include "core/distributed/helpers.hpp"
+
+
+/**
+ * A class for controlling the number of warmup and timed iterations.
+ *
+ * The behavior is determined by the following flags
+ * - 'repetitions' switch between fixed and adaptive number of iterations
+ * - 'warmup' warmup iterations, applies in fixed and adaptive case
+ * - 'min_repetitions' minimal number of repetitions (adaptive case)
+ * - 'max_repetitions' maximal number of repetitions (adaptive case)
+ * - 'min_runtime' minimal total runtime (adaptive case)
+ * - 'repetition_growth_factor' controls the increase between two successive
+ *   timings
+ *
+ * Usage:
+ * `IterationControl` exposes the member functions:
+ * - `warmup_run()`: controls run defined by `warmup` flag
+ * - `run(bool)`: controls run defined by all other flags
+ * - `get_timer()`: access to underlying timer
+ * The first two methods return an object that is to be used in a range-based
+ * for loop:
+ * ```
+ * IterationControl ic(get_timer(...));
+ *
+ * // warmup run always uses a fixed number of iterations and does not issue
+ * // timings
+ * for(auto status: ic.warmup_run()){
+ *   // execute benchmark
+ * }
+ * // run may use adaptive number of iterations (depending on cmd line flag)
+ * // and issues timing (unless manage_timings is false)
+ * for(auto status: ic.run(manage_timings [default is true])){
+ *   if(! manage_timings) ic.get_timer()->tic();
+ *   // execute benchmark
+ *   if(! manage_timings) ic.get_timer()->toc();
+ * }
+ *
+ * ```
+ * At the beginning of both methods, the timer is reset.
+ * The `status` object exposes the member
+ * - `cur_it`, containing the current iteration number,
+ * and the method
+ * - `is_finished`, which checks if the benchmark is finished.
+ */
+class IterationControl {
+    using IndexType = unsigned int;  //!< to be compatible with GFLAGS type
+
+    class run_control;
+
+public:
+    /**
+     * Creates an `IterationControl` object.
+     *
+     * Uses the commandline flags to setup the stopping criteria for the
+     * warmup and timed run.
+     *
+     * @param timer  the timer that is to be used for the timings
+     */
+    explicit IterationControl(const std::shared_ptr<Timer>& timer)
+    {
+        status_warmup_ = {TimerManager{timer, false}, FLAGS_warmup,
+                          FLAGS_warmup, 0., 0};
+        if (FLAGS_repetitions == "auto") {
+            status_run_ = {TimerManager{timer, true}, FLAGS_min_repetitions,
+                           FLAGS_max_repetitions, FLAGS_min_runtime};
+        } else {
+            const auto reps =
+                static_cast<unsigned int>(std::stoi(FLAGS_repetitions));
+            status_run_ = {TimerManager{timer, true}, reps, reps, 0., 0};
+        }
+    }
+
+    IterationControl() = default;
+    IterationControl(const IterationControl&) = default;
+    IterationControl(IterationControl&&) = default;
+
+    /**
+     * Creates iterable `run_control` object for the warmup run.
+     *
+     * This run always uses a fixed number of iterations.
+     */
+    run_control warmup_run()
+    {
+        status_warmup_.cur_it = 0;
+        status_warmup_.managed_timer.clear();
+        return run_control{&status_warmup_};
+    }
+
+    /**
+     * Creates iterable `run_control` object for the timed run.
+     *
+     * This run may be adaptive, depending on the commandline flags.
+     *
+     * @param manage_timings If true, the timer calls (`tic/toc`) are handled
+     * by the `run_control` object, otherwise they need to be executed outside
+     */
+    run_control run(bool manage_timings = true)
+    {
+        status_run_.cur_it = 0;
+        status_run_.managed_timer.clear();
+        status_run_.managed_timer.manage_timings = manage_timings;
+        return run_control{&status_run_};
+    }
+
+    std::shared_ptr<Timer> get_timer() const
+    {
+        return status_run_.managed_timer.timer;
+    }
+
+    /**
+     * Compute the time from the given statistical method
+     *
+     * @param method  the statistical method. If the timer did not record the
+     *                same number of repetitions as the IterationControl, only
+     *                "average" is supported (total time / repetitions).
+     *
+     * @return the statistical time
+     */
+    double compute_time(const std::string& method = "average") const
+    {
+        if (status_run_.managed_timer.timer->get_num_repetitions() ==
+            this->get_num_repetitions()) {
+            return status_run_.managed_timer.compute_time(method);
+        } else {
+            assert(method == "average");
+            return status_run_.managed_timer.get_total_time() /
+                   this->get_num_repetitions();
+        }
+    }
+
+    IndexType get_num_repetitions() const { return status_run_.cur_it; }
+
+private:
+    struct TimerManager {
+        std::shared_ptr<Timer> timer;
+        bool manage_timings = false;
+
+        void tic()
+        {
+            if (manage_timings) {
+                timer->tic();
+            }
+        }
+        void toc(unsigned int num = 1)
+        {
+            if (manage_timings) {
+                timer->toc(num);
+            }
+        }
+
+        void clear() { timer->clear(); }
+
+        double get_total_time() const { return timer->get_total_time(); }
+
+        double compute_time(const std::string& method = "average") const
+        {
+            return timer->compute_time(method);
+        }
+    };
+
+    /**
+     * Stores stopping criteria of the adaptive benchmark run as well as the
+     * current iteration number.
+     */
+    struct status {
+        TimerManager managed_timer{};
+
+        IndexType min_it = 0;
+        IndexType max_it = 0;
+        double max_runtime = 0.;
+
+        IndexType cur_it = 0;
+
+        /**
+         * checks if the adaptive run is complete
+         *
+         * the adaptive run is complete if:
+         * - the minimum number of iterations is reached
+         * - and either:
+         *   - the maximum number of repetitions is reached
+         *   - the total runtime is above the threshold
+         *
+         * @return completeness state of the adaptive run
+         */
+        bool is_finished() const
+        {
+            return cur_it >= min_it &&
+                   (cur_it >= max_it ||
+                    managed_timer.get_total_time() >= max_runtime);
+        }
+    };
+
+    /**
+     * Iterable class managing the benchmark iteration.
+     *
+     * Has to be used in a range-based for loop.
+     */
+    struct run_control {
+        struct iterator {
+            /**
+             * Increases the current iteration count and finishes timing if
+             * necessary.
+             *
+             * As `++it` is the last step of a for-loop, the managed_timer is
+             * stopped if enough iterations have passed since the last timing.
+             * The interval between two timings grows by the factor
+             * `repetition_growth_factor` to reduce the timing overhead.
+             */
+            iterator& operator++()
+            {
+                cur_info->cur_it++;
+                if (cur_info->cur_it >= next_timing && !stopped) {
+                    cur_info->managed_timer.toc(
+                        static_cast<unsigned>(cur_info->cur_it - start_timing));
+                    stopped = true;
+                    next_timing = static_cast<IndexType>(std::ceil(
+                        next_timing * FLAGS_repetition_growth_factor));
+                    // If repetition_growth_factor <= 1, next_timing would not
+                    // advance, so time the very next iteration instead.
+                    if (next_timing <= cur_info->cur_it) {
+                        next_timing = cur_info->cur_it + 1;
+                    }
+                }
+                return *this;
+            }
+
+            status operator*() const { return *cur_info; }
+
+            /**
+             * Checks if the benchmark is finished and handles timing, if
+             * necessary.
+             *
+             * As `begin != end` is the first step in a for-loop, the
+             * managed_timer is started, if it was previously stopped.
+             * Additionally, if the benchmark is complete and the managed_timer
+             * is still running it is stopped. (This may occur if the maximal
+             * number of repetitions is surpassed)
+             *
+             * Uses only the information from the `status` object, i.e.
+             * the right hand side is ignored.
+             *
+             * @return true if benchmark is not finished, else false
+             */
+            bool operator!=(const iterator&)
+            {
+                const bool is_finished = cur_info->is_finished();
+                if (!is_finished && stopped) {
+                    stopped = false;
+                    cur_info->managed_timer.tic();
+                    start_timing = cur_info->cur_it;
+                } else if (is_finished && !stopped) {
+                    cur_info->managed_timer.toc(
+                        static_cast<unsigned>(cur_info->cur_it - start_timing));
+                    stopped = true;
+                }
+                return !is_finished;
+            }
+
+            status* cur_info;
+            IndexType next_timing = 1;   //!< next iteration to stop timing
+            IndexType start_timing = 0;  //!< iteration for starting timing
+            bool stopped = true;
+        };
+
+        iterator begin() const { return iterator{info}; }
+
+        // not used, could potentially be used in c++17 as a sentinel
+        iterator end() const { return iterator{}; }
+
+        status* info;
+    };
+
+    status status_warmup_;
+    status status_run_;
+};
+
+
+#endif  // GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_
diff --git a/benchmark/utils/json.hpp b/benchmark/utils/json.hpp
index b0cd384cae5..684db0229aa 100644
--- a/benchmark/utils/json.hpp
+++ b/benchmark/utils/json.hpp
@@ -34,69 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_BENCHMARK_UTILS_JSON_HPP_
 
 
-#include <ginkgo/ginkgo.hpp>
+#include <nlohmann/json.hpp>
 
 
-#include <type_traits>
-
-
-#include <rapidjson/document.h>
-#include <rapidjson/istreamwrapper.h>
-#include <rapidjson/ostreamwrapper.h>
-#include <rapidjson/prettywriter.h>
-
-
-// helper for setting rapidjson object members
-template <typename T, typename NameType, typename Allocator>
-std::enable_if_t<
-    !std::is_same<typename std::decay<T>::type, gko::size_type>::value, void>
-add_or_set_member(rapidjson::Value& object, NameType&& name, T&& value,
-                  Allocator&& allocator)
-{
-    if (object.HasMember(name)) {
-        object[name] = std::forward<T>(value);
-    } else {
-        auto n = rapidjson::Value(name, allocator);
-        object.AddMember(n, std::forward<T>(value), allocator);
-    }
-}
-
-
-/**
-   @internal This is required to fix some MacOS problems (and possibly other
-   compilers). There is no explicit RapidJSON constructor for `std::size_t` so a
-   conversion to a known constructor is required to solve any ambiguity. See the
-   last comments of https://github.com/ginkgo-project/ginkgo/issues/270.
- */
-template <typename T, typename NameType, typename Allocator>
-std::enable_if_t<
-    std::is_same<typename std::decay<T>::type, gko::size_type>::value, void>
-add_or_set_member(rapidjson::Value& object, NameType&& name, T&& value,
-                  Allocator&& allocator)
-{
-    if (object.HasMember(name)) {
-        object[name] =
-            std::forward<std::uint64_t>(static_cast<std::uint64_t>(value));
-    } else {
-        auto n = rapidjson::Value(name, allocator);
-        object.AddMember(
-            n, std::forward<std::uint64_t>(static_cast<std::uint64_t>(value)),
-            allocator);
-    }
-}
-
-
-// helper for writing out rapidjson Values
-inline std::ostream& operator<<(std::ostream& os, const rapidjson::Value& value)
-{
-    rapidjson::OStreamWrapper jos(os);
-    rapidjson::PrettyWriter<rapidjson::OStreamWrapper, rapidjson::UTF8<>,
-                            rapidjson::UTF8<>, rapidjson::CrtAllocator,
-                            rapidjson::kWriteNanAndInfFlag>
-        writer(jos);
-    value.Accept(writer);
-    return os;
-}
+using json = nlohmann::ordered_json;
 
 
 #endif  // GKO_BENCHMARK_UTILS_JSON_HPP_
diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp
index e3e6228604e..1e651811f0f 100644
--- a/benchmark/utils/loggers.hpp
+++ b/benchmark/utils/loggers.hpp
@@ -50,10 +50,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter,
                            gko::log::ProfilerHook::NestedSummaryWriter {
-    JsonSummaryWriter(rapidjson::Value& object,
-                      rapidjson::MemoryPoolAllocator<>& alloc,
-                      gko::uint32 repetitions)
-        : object{&object}, alloc{&alloc}, repetitions{repetitions}
+    JsonSummaryWriter(json& object, gko::uint32 repetitions)
+        : object{&object}, repetitions{repetitions}
     {}
 
     void write(
@@ -62,13 +60,11 @@ struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter,
     {
         for (const auto& entry : entries) {
             if (entry.name != "total") {
-                add_or_set_member(*object, entry.name.c_str(),
-                                  entry.exclusive.count() * 1e-9 / repetitions,
-                                  *alloc);
+                (*object)[entry.name] =
+                    entry.exclusive.count() * 1e-9 / repetitions;
             }
         }
-        add_or_set_member(*object, "overhead",
-                          overhead.count() * 1e-9 / repetitions, *alloc);
+        (*object)["overhead"] = overhead.count() * 1e-9 / repetitions;
     }
 
     void write_nested(const gko::log::ProfilerHook::nested_summary_entry& root,
@@ -84,27 +80,24 @@ struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter,
                 visit(visit, child, new_prefix);
                 exclusive -= child.elapsed;
             }
-            add_or_set_member(*object, (prefix + node.name).c_str(),
-                              exclusive.count() * 1e-9 / repetitions, *alloc);
+            (*object)[prefix + node.name] =
+                exclusive.count() * 1e-9 / repetitions;
         };
         // we don't need to annotate the total
         for (const auto& child : root.children) {
             visit(visit, child, "");
         }
-        add_or_set_member(*object, "overhead",
-                          overhead.count() * 1e-9 / repetitions, *alloc);
+        (*object)["overhead"] = overhead.count() * 1e-9 / repetitions;
     }
 
-    rapidjson::Value* object;
-    rapidjson::MemoryPoolAllocator<>* alloc;
+    json* object;
     gko::uint32 repetitions;
 };
 
 
 inline std::shared_ptr<gko::log::ProfilerHook> create_operations_logger(
     bool gpu_timer, bool nested, std::shared_ptr<gko::Executor> exec,
-    rapidjson::Value& object, rapidjson::MemoryPoolAllocator<>& alloc,
-    gko::uint32 repetitions)
+    json& object, gko::uint32 repetitions)
 {
     std::shared_ptr<gko::Timer> timer;
     if (gpu_timer) {
@@ -114,12 +107,10 @@ inline std::shared_ptr<gko::log::ProfilerHook> create_operations_logger(
     }
     if (nested) {
         return gko::log::ProfilerHook::create_nested_summary(
-            timer,
-            std::make_unique<JsonSummaryWriter>(object, alloc, repetitions));
+            timer, std::make_unique<JsonSummaryWriter>(object, repetitions));
     } else {
         return gko::log::ProfilerHook::create_summary(
-            timer,
-            std::make_unique<JsonSummaryWriter>(object, alloc, repetitions));
+            timer, std::make_unique<JsonSummaryWriter>(object, repetitions));
     }
 }
 
@@ -140,21 +131,18 @@ struct StorageLogger : gko::log::Logger {
         storage[location] = 0;
     }
 
-    void write_data(rapidjson::Value& output,
-                    rapidjson::MemoryPoolAllocator<>& allocator)
+    void write_data(json& output)
     {
         const std::lock_guard<std::mutex> lock(mutex);
         gko::size_type total{};
         for (const auto& e : storage) {
             total += e.second;
         }
-        add_or_set_member(output, "storage", total, allocator);
+        output["storage"] = total;
     }
 
 #if GINKGO_BUILD_MPI
-    void write_data(gko::experimental::mpi::communicator comm,
-                    rapidjson::Value& output,
-                    rapidjson::MemoryPoolAllocator<>& allocator)
+    void write_data(gko::experimental::mpi::communicator comm, json& output)
     {
         const std::lock_guard<std::mutex> lock(mutex);
         gko::size_type total{};
@@ -166,7 +154,7 @@ struct StorageLogger : gko::log::Logger {
                         ? static_cast<gko::size_type*>(MPI_IN_PLACE)
                         : &total,
                     &total, 1, MPI_SUM, 0);
-        add_or_set_member(output, "storage", total, allocator);
+        output["storage"] = total;
     }
 #endif
 
@@ -191,17 +179,16 @@ struct ResidualLogger : gko::log::Logger {
                                const gko::array<gko::stopping_status>* status,
                                bool all_stopped) const override
     {
-        timestamps.PushBack(std::chrono::duration<double>(
-                                std::chrono::steady_clock::now() - start)
-                                .count(),
-                            alloc);
+        timestamps.push_back(std::chrono::duration<double>(
+                                 std::chrono::steady_clock::now() - start)
+                                 .count());
         if (residual_norm) {
-            rec_res_norms.PushBack(
-                get_norm(gko::as<vec<rc_vtype>>(residual_norm)), alloc);
+            rec_res_norms.push_back(
+                get_norm(gko::as<vec<rc_vtype>>(residual_norm)));
         } else {
             gko::detail::vector_dispatch<rc_vtype>(
                 residual, [&](const auto v_residual) {
-                    rec_res_norms.PushBack(compute_norm2(v_residual), alloc);
+                    rec_res_norms.push_back(compute_norm2(v_residual));
                 });
         }
         if (solution) {
@@ -209,32 +196,25 @@ struct ResidualLogger : gko::log::Logger {
                 rc_vtype>(solution, [&](auto v_solution) {
                 using concrete_type =
                     std::remove_pointer_t<std::decay_t<decltype(v_solution)>>;
-                true_res_norms.PushBack(
-                    compute_residual_norm(matrix, gko::as<concrete_type>(b),
-                                          v_solution),
-                    alloc);
+                true_res_norms.push_back(compute_residual_norm(
+                    matrix, gko::as<concrete_type>(b), v_solution));
             });
         } else {
-            true_res_norms.PushBack(-1.0, alloc);
+            true_res_norms.push_back(-1.0);
         }
         if (implicit_sq_residual_norm) {
-            implicit_res_norms.PushBack(
-                std::sqrt(get_norm(
-                    gko::as<vec<rc_vtype>>(implicit_sq_residual_norm))),
-                alloc);
+            implicit_res_norms.push_back(std::sqrt(
+                get_norm(gko::as<vec<rc_vtype>>(implicit_sq_residual_norm))));
             has_implicit_res_norm = true;
         } else {
-            implicit_res_norms.PushBack(-1.0, alloc);
+            implicit_res_norms.push_back(-1.0);
         }
     }
 
     ResidualLogger(gko::ptr_param<const gko::LinOp> matrix,
-                   gko::ptr_param<const gko::LinOp> b,
-                   rapidjson::Value& rec_res_norms,
-                   rapidjson::Value& true_res_norms,
-                   rapidjson::Value& implicit_res_norms,
-                   rapidjson::Value& timestamps,
-                   rapidjson::MemoryPoolAllocator<>& alloc)
+                   gko::ptr_param<const gko::LinOp> b, json& rec_res_norms,
+                   json& true_res_norms, json& implicit_res_norms,
+                   json& timestamps)
         : gko::log::Logger(gko::log::Logger::iteration_complete_mask),
           matrix{matrix.get()},
           b{b.get()},
@@ -243,8 +223,7 @@ struct ResidualLogger : gko::log::Logger {
           true_res_norms{true_res_norms},
           has_implicit_res_norm{},
           implicit_res_norms{implicit_res_norms},
-          timestamps{timestamps},
-          alloc{alloc}
+          timestamps{timestamps}
     {}
 
     bool has_implicit_res_norms() const { return has_implicit_res_norm; }
@@ -253,12 +232,11 @@ struct ResidualLogger : gko::log::Logger {
     const gko::LinOp* matrix;
     const gko::LinOp* b;
     std::chrono::steady_clock::time_point start;
-    rapidjson::Value& rec_res_norms;
-    rapidjson::Value& true_res_norms;
+    json& rec_res_norms;
+    json& true_res_norms;
     mutable bool has_implicit_res_norm;
-    rapidjson::Value& implicit_res_norms;
-    rapidjson::Value& timestamps;
-    rapidjson::MemoryPoolAllocator<>& alloc;
+    json& implicit_res_norms;
+    json& timestamps;
 };
 
 
@@ -279,11 +257,7 @@ struct IterationLogger : gko::log::Logger {
         : gko::log::Logger(gko::log::Logger::iteration_complete_mask)
     {}
 
-    void write_data(rapidjson::Value& output,
-                    rapidjson::MemoryPoolAllocator<>& allocator)
-    {
-        add_or_set_member(output, "iterations", this->num_iters, allocator);
-    }
+    void write_data(json& output) { output["iterations"] = this->num_iters; }
 
 private:
     mutable gko::size_type num_iters{0};
diff --git a/benchmark/utils/runner.hpp b/benchmark/utils/runner.hpp
new file mode 100644
index 00000000000..3520f7299ee
--- /dev/null
+++ b/benchmark/utils/runner.hpp
@@ -0,0 +1,209 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_BENCHMARK_UTILS_RUNNER_HPP_
+#define GKO_BENCHMARK_UTILS_RUNNER_HPP_
+
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+
+#include "benchmark/utils/general.hpp"
+
+/** Creates the profiler hook selected by the `profiler_hook` flag. */
+inline std::shared_ptr<gko::log::ProfilerHook> create_profiler_hook(
+    std::shared_ptr<const gko::Executor> exec, bool do_print = true)
+{
+    using gko::log::ProfilerHook;
+    std::map<std::string, std::function<std::shared_ptr<ProfilerHook>()>>
+        hook_map{
+            {"none", [] { return std::shared_ptr<ProfilerHook>{}; }},
+            {"auto", [&] { return ProfilerHook::create_for_executor(exec); }},
+            {"nvtx", [] { return ProfilerHook::create_nvtx(); }},
+            {"roctx", [] { return ProfilerHook::create_roctx(); }},
+            {"tau", [] { return ProfilerHook::create_tau(); }},
+            {"vtune", [] { return ProfilerHook::create_vtune(); }},
+            {"debug", [do_print] {
+                 return ProfilerHook::create_custom(
+                     [do_print](const char* name,
+                                gko::log::profile_event_category) {
+                         if (do_print) {
+                             std::clog << "DEBUG: begin " << name << '\n';
+                         }
+                     },
+                     [do_print](const char* name,
+                                gko::log::profile_event_category) {
+                         if (do_print) {
+                             std::clog << "DEBUG: end   " << name << '\n';
+                         }
+                     });
+             }}};
+    return hook_map.at(FLAGS_profiler_hook)();  // throws for unknown names
+}
+
+
+template <typename State>
+struct Benchmark {
+    /** The name to be used in the JSON output. */
+    virtual const std::string& get_name() const = 0;
+
+    /** The operations to loop over for each test case. */
+    virtual const std::vector<std::string>& get_operations() const = 0;
+
+    /** Should we write logging output? */
+    virtual bool should_print() const = 0;
+
+    /** Example JSON input */
+    virtual std::string get_example_config() const = 0;
+
+    /** Is the input test case in the correct format? */
+    virtual bool validate_config(const json& value) const = 0;
+
+    /** Textual representation of the test case for profiler annotation */
+    virtual std::string describe_config(const json& test_case) const = 0;
+
+    /** Sets up shared state and test case info */
+    virtual State setup(std::shared_ptr<gko::Executor> exec,
+                        json& test_case) const = 0;
+
+    /** Runs a single operation of the benchmark */
+    virtual void run(std::shared_ptr<gko::Executor> exec,
+                     std::shared_ptr<Timer> timer, State& state,
+                     const std::string& operation,
+                     json& operation_case) const = 0;
+
+    /** Post-process test case info. */
+    virtual void postprocess(json& test_case) const {}
+};
+
+
+template <typename State>
+void run_test_cases(const Benchmark<State>& benchmark,
+                    std::shared_ptr<gko::Executor> exec,
+                    std::shared_ptr<Timer> timer, json& test_cases)
+{
+    if (!test_cases.is_array()) {
+        if (benchmark.should_print()) {
+            std::cerr
+                << "Input has to be a JSON array of benchmark configurations:\n"
+                << benchmark.get_example_config() << std::endl;
+        }
+        std::exit(1);
+    }
+    for (const auto& test_case : test_cases) {
+        if (!test_case.is_object() || !benchmark.validate_config(test_case)) {
+            if (benchmark.should_print()) {
+                std::cerr << "Invalid test case:\n"
+                          << std::setw(4) << test_case << "\nInput format:\n"
+                          << benchmark.get_example_config() << std::endl;
+            }
+            std::exit(2);
+        }
+    }
+
+    auto profiler_hook = create_profiler_hook(exec, benchmark.should_print());
+    if (profiler_hook) {
+        exec->add_logger(profiler_hook);
+    }
+    auto annotate =
+        [profiler_hook](const char* name) -> gko::log::profiling_scope_guard {
+        if (profiler_hook) {
+            return profiler_hook->user_range(name);
+        }
+        return {};
+    };
+
+    for (auto& test_case : test_cases) {
+        try {
+            // set up benchmark
+            if (!test_case.contains(benchmark.get_name())) {
+                test_case[benchmark.get_name()] = json::object();
+            }
+            if (benchmark.should_print()) {
+                std::clog << "Running test case\n"
+                          << std::setw(4) << test_case << std::endl;
+            }
+            auto test_case_state = benchmark.setup(exec, test_case);
+            auto test_case_str = benchmark.describe_config(test_case);
+            auto test_case_range = annotate(test_case_str.c_str());
+            auto& benchmark_case = test_case[benchmark.get_name()];
+            for (const auto& operation_name : benchmark.get_operations()) {
+                if (benchmark_case.contains(operation_name) &&
+                    !FLAGS_overwrite) {
+                    continue;
+                }
+                benchmark_case[operation_name] = json::object();
+                if (benchmark.should_print()) {
+                    std::clog << "\tRunning " << benchmark.get_name() << ": "
+                              << operation_name << std::endl;
+                }
+                auto& operation_case = benchmark_case[operation_name];
+                try {
+                    auto operation_range = annotate(operation_name.c_str());
+                    benchmark.run(exec, timer, test_case_state, operation_name,
+                                  operation_case);
+                    operation_case["completed"] = true;
+                } catch (const std::exception& e) {
+                    operation_case["completed"] = false;
+                    operation_case["error_type"] =
+                        gko::name_demangling::get_dynamic_type(e);
+                    operation_case["error"] = e.what();
+                    std::cerr << "Error when processing test case\n"
+                              << std::setw(4) << test_case << "\n"
+                              << "what(): " << e.what() << std::endl;
+                }
+
+                if (benchmark.should_print()) {
+                    backup_results(test_cases);
+                }
+            }
+            benchmark.postprocess(test_case);
+        } catch (const std::exception& e) {
+            std::cerr << "Error setting up benchmark, what(): " << e.what()
+                      << std::endl;
+            test_case["error_type"] = gko::name_demangling::get_dynamic_type(e);
+            test_case["error"] = e.what();
+        }
+    }
+
+    if (profiler_hook) {
+        exec->remove_logger(profiler_hook);
+    }
+}
+
+
+#endif  // GKO_BENCHMARK_UTILS_RUNNER_HPP_
diff --git a/benchmark/utils/spmv_validation.hpp b/benchmark/utils/spmv_validation.hpp
deleted file mode 100644
index 83ea2085ec2..00000000000
--- a/benchmark/utils/spmv_validation.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2023, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#ifndef GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_
-#define GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_
-
-
-#include <ginkgo/ginkgo.hpp>
-
-
-#include <cstdlib>
-#include <iostream>
-
-
-#include <rapidjson/document.h>
-
-
-std::string example_config = R"(
-  [
-    {"filename": "my_file.mtx"},
-    {"filename": "my_file2.mtx"},
-    {"size": 100, "stencil": "7pt"},
-  ]
-)";
-
-
-/**
- * Function which outputs the input format for benchmarks similar to the spmv.
- */
-[[noreturn]] void print_config_error_and_exit()
-{
-    std::cerr << "Input has to be a JSON array of matrix configurations:\n"
-              << example_config << std::endl;
-    std::exit(1);
-}
-
-
-/**
- * Validates whether the input format is correct for spmv-like benchmarks.
- *
- * @param value  the JSON value to test.
- */
-void validate_option_object(const rapidjson::Value& value)
-{
-    if (!value.IsObject() ||
-        !((value.HasMember("size") && value.HasMember("stencil") &&
-           value["size"].IsInt64() && value["stencil"].IsString()) ||
-          (value.HasMember("filename") && value["filename"].IsString()))) {
-        print_config_error_and_exit();
-    }
-}
-
-
-#endif  // GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index a54d4d506ee..828f95bc8ca 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -14,8 +14,8 @@ if(GINKGO_BUILD_BENCHMARKS)
     if (NOT gflags_FOUND)
         add_subdirectory(gflags)
     endif()
-    if (NOT RapidJSON_FOUND)
-        add_subdirectory(rapidjson)
+    if (NOT nlohmann_json_FOUND)  # NOTE(review): presumably set by a find_package call outside this hunk - confirm
+        add_subdirectory(nlohmann_json)  # configures third_party/nlohmann_json/CMakeLists.txt
     endif()
 endif()
 
diff --git a/third_party/nlohmann_json/CMakeLists.txt b/third_party/nlohmann_json/CMakeLists.txt
new file mode 100644
index 00000000000..77064c66c40
--- /dev/null
+++ b/third_party/nlohmann_json/CMakeLists.txt
@@ -0,0 +1,9 @@
+message(STATUS "Fetching external nlohmann_json")
+include(FetchContent)  # provides FetchContent_Declare and FetchContent_MakeAvailable
+FetchContent_Declare(
+    nlohmann_json
+    GIT_REPOSITORY https://github.com/nlohmann/json.git
+    GIT_TAG        bc889afb4c5bf1c0d8ee29ef35eaaf4c8bef8a5d  # full commit hash, pins one exact revision
+)
+set(JSON_BuildTests OFF CACHE INTERNAL "")  # nlohmann_json's test-suite switch; set before it is configured below
+FetchContent_MakeAvailable(nlohmann_json)
diff --git a/third_party/rapidjson/CMakeLists.txt b/third_party/rapidjson/CMakeLists.txt
deleted file mode 100644
index a96b90cb882..00000000000
--- a/third_party/rapidjson/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-message(STATUS "Fetching external RapidJSON")
-include(FetchContent)
-FetchContent_Declare(
-    rapidjson
-    GIT_REPOSITORY https://github.com/Tencent/rapidjson.git
-    GIT_TAG        27c3a8dc0e2c9218fe94986d249a12b5ed838f1d
-)
-FetchContent_GetProperties(rapidjson)
-if(NOT rapidjson_POPULATED)
-    FetchContent_Populate(rapidjson)
-endif()
-set(RapidJSON_INCLUDE_DIR "${rapidjson_SOURCE_DIR}/include")
-add_library(rapidjson INTERFACE)
-set_target_properties(rapidjson PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${RapidJSON_INCLUDE_DIR}")

From c1cee359b333d3b8438db4598c9afb79b89e3478 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 27 Jul 2023 23:45:43 +0200
Subject: [PATCH 02/13] add distributed tests again

This reverts commit 0dab7626e920bfdf32a2285ff5741da1e36404cb.
Additionally, replaces the JSON dump of each test case in the log output with the test case's description.
---
 benchmark/test/CMakeLists.txt                 |   4 +-
 benchmark/test/input.distributed_mtx.json     |   7 +
 benchmark/test/multi_vector_distributed.py    |  38 ++
 benchmark/test/reference/blas.profile.stderr  |   6 +-
 benchmark/test/reference/blas.simple.stderr   |   6 +-
 .../test/reference/conversion.all.stderr      |   7 +-
 .../test/reference/conversion.profile.stderr  |   7 +-
 .../test/reference/conversion.simple.stderr   |   7 +-
 .../distributed_solver.profile.stderr         |  11 +-
 .../distributed_solver.simple.stderr          |  11 +-
 .../reference/matrix_statistics.simple.stderr |   7 +-
 .../multi_vector_distributed.profile.stderr   | 254 ++++++++++
 .../multi_vector_distributed.profile.stdout   |  29 ++
 .../multi_vector_distributed.simple.stderr    |  10 +
 .../multi_vector_distributed.simple.stdout    |  29 ++
 .../reference/preconditioner.profile.stderr   |   7 +-
 .../reference/preconditioner.simple.stderr    |   7 +-
 .../test/reference/solver.profile.stderr      |  10 +-
 benchmark/test/reference/solver.simple.stderr |  10 +-
 .../test/reference/sparse_blas.profile.stderr |   7 +-
 .../test/reference/sparse_blas.simple.stderr  |   7 +-
 benchmark/test/reference/spmv.profile.stderr  |   7 +-
 benchmark/test/reference/spmv.simple.stderr   |   7 +-
 .../reference/spmv_distributed.profile.stderr | 446 ++++++++++++++++++
 .../reference/spmv_distributed.profile.stdout |  22 +
 .../reference/spmv_distributed.simple.stderr  |  10 +
 .../reference/spmv_distributed.simple.stdout  |  23 +
 benchmark/test/spmv_distributed.py            |  42 ++
 benchmark/test/test_framework.py.in           |   2 +-
 benchmark/utils/general.hpp                   |  39 --
 benchmark/utils/runner.hpp                    |  10 +-
 31 files changed, 935 insertions(+), 154 deletions(-)
 create mode 100644 benchmark/test/input.distributed_mtx.json
 create mode 100644 benchmark/test/multi_vector_distributed.py
 create mode 100644 benchmark/test/reference/multi_vector_distributed.profile.stderr
 create mode 100644 benchmark/test/reference/multi_vector_distributed.profile.stdout
 create mode 100644 benchmark/test/reference/multi_vector_distributed.simple.stderr
 create mode 100644 benchmark/test/reference/multi_vector_distributed.simple.stdout
 create mode 100644 benchmark/test/reference/spmv_distributed.profile.stderr
 create mode 100644 benchmark/test/reference/spmv_distributed.profile.stdout
 create mode 100644 benchmark/test/reference/spmv_distributed.simple.stderr
 create mode 100644 benchmark/test/reference/spmv_distributed.simple.stdout
 create mode 100644 benchmark/test/spmv_distributed.py

diff --git a/benchmark/test/CMakeLists.txt b/benchmark/test/CMakeLists.txt
index e1aab6dd75d..1cd589927fa 100644
--- a/benchmark/test/CMakeLists.txt
+++ b/benchmark/test/CMakeLists.txt
@@ -22,5 +22,7 @@ add_benchmark_test(solver)
 add_benchmark_test(sparse_blas)
 add_benchmark_test(spmv)
 if (GINKGO_BUILD_MPI)
+    add_benchmark_test(multi_vector_distributed)  # distributed tests are only registered for MPI builds
+    add_benchmark_test(spmv_distributed)
     add_benchmark_test(solver_distributed)
 endif()
diff --git a/benchmark/test/input.distributed_mtx.json b/benchmark/test/input.distributed_mtx.json
new file mode 100644
index 00000000000..aca115179e6
--- /dev/null
+++ b/benchmark/test/input.distributed_mtx.json
@@ -0,0 +1,7 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "comm_pattern": "stencil"
+    }
+]
diff --git a/benchmark/test/multi_vector_distributed.py b/benchmark/test/multi_vector_distributed.py
new file mode 100644
index 00000000000..1e0c4c8adf5
--- /dev/null
+++ b/benchmark/test/multi_vector_distributed.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+import test_framework
+
+# The first three calls feed the same test case through different input modes
+# 1) inline: the JSON array is the value of the -input flag
+test_framework.compare_output_distributed(
+    ["-input", '[{"n": 100}]'],
+    expected_stdout="multi_vector_distributed.simple.stdout",
+    expected_stderr="multi_vector_distributed.simple.stderr",
+    num_procs=3,
+)
+
+# 2) stdin: no command-line arguments, the JSON array is passed as stdin
+test_framework.compare_output_distributed(
+    [],
+    expected_stdout="multi_vector_distributed.simple.stdout",
+    expected_stderr="multi_vector_distributed.simple.stderr",
+    stdin='[{"n": 100}]',
+    num_procs=3,
+)
+
+# 3) file: -input is a path built from test_framework.sourcepath
+test_framework.compare_output_distributed(
+    ["-input", str(test_framework.sourcepath / "input.blas.json")],
+    expected_stdout="multi_vector_distributed.simple.stdout",
+    expected_stderr="multi_vector_distributed.simple.stderr",
+    stdin='[{"n": 100}]',  # NOTE(review): stdin is passed although -input names a file - confirm it is meant to be ignored
+    num_procs=3,
+)
+
+# 4) inline input again, adding -profile and the "debug" profiler hook; checked against the .profile references
+test_framework.compare_output_distributed(
+    ["-input", '[{"n": 100}]', "-profile", "-profiler_hook", "debug"],
+    expected_stdout="multi_vector_distributed.profile.stdout",
+    expected_stderr="multi_vector_distributed.profile.stderr",
+    stdin='[{"n": 100}]',  # NOTE(review): stdin is passed although -input already carries the JSON - confirm intent
+    num_procs=3,
+)
diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr
index b64f4321287..1313c85e462 100644
--- a/benchmark/test/reference/blas.profile.stderr
+++ b/benchmark/test/reference/blas.profile.stderr
@@ -4,11 +4,7 @@ Running on reference(0)
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The operations are copy,axpy,scal
-Running test case
-{
-    "n": 100,
-    "blas": {}
-}
+Running test case n = 100 
 DEBUG: begin n = 100 
 	Running blas: copy
 DEBUG: begin copy
diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr
index f41b25c6ee1..966ed597166 100644
--- a/benchmark/test/reference/blas.simple.stderr
+++ b/benchmark/test/reference/blas.simple.stderr
@@ -4,11 +4,7 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The operations are copy,axpy,scal
-Running test case
-{
-    "n": 100,
-    "blas": {}
-}
+Running test case n = 100 
 	Running blas: copy
 	Running blas: axpy
 	Running blas: scal
diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr
index 1d5df7477ba..77ff50a1b89 100644
--- a/benchmark/test/reference/conversion.all.stderr
+++ b/benchmark/test/reference/conversion.all.stderr
@@ -4,12 +4,7 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr,ell,sellp,hybrid
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "conversion": {}
-}
+Running test case stencil(100,7pt)
 Matrix is of size (125, 125), 725
 	Running conversion: coo-read
 	Running conversion: coo-csr
diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr
index 089e6be02f9..6078dd3db2f 100644
--- a/benchmark/test/reference/conversion.profile.stderr
+++ b/benchmark/test/reference/conversion.profile.stderr
@@ -4,12 +4,7 @@ Running on reference(0)
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "conversion": {}
-}
+Running test case stencil(100,7pt)
 Matrix is of size (125, 125), 725
 DEBUG: begin stencil(100,7pt)
 	Running conversion: coo-read
diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr
index a814dba6888..9b51effac09 100644
--- a/benchmark/test/reference/conversion.simple.stderr
+++ b/benchmark/test/reference/conversion.simple.stderr
@@ -4,12 +4,7 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "conversion": {}
-}
+Running test case stencil(100,7pt)
 Matrix is of size (125, 125), 725
 	Running conversion: coo-read
 	Running conversion: coo-csr
diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr
index e583a1411a8..1daab773a38 100644
--- a/benchmark/test/reference/distributed_solver.profile.stderr
+++ b/benchmark/test/reference/distributed_solver.profile.stderr
@@ -5,16 +5,7 @@ Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "comm_pattern": "stencil",
-    "optimal": {
-        "spmv": "csr-csr"
-    },
-    "solver": {}
-}
+Running test case stencil(100,7pt,stencil)
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin partition::build_ranges_from_global_size
diff --git a/benchmark/test/reference/distributed_solver.simple.stderr b/benchmark/test/reference/distributed_solver.simple.stderr
index 9feb7fa9522..607081a3949 100644
--- a/benchmark/test/reference/distributed_solver.simple.stderr
+++ b/benchmark/test/reference/distributed_solver.simple.stderr
@@ -5,15 +5,6 @@ Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "comm_pattern": "stencil",
-    "optimal": {
-        "spmv": "csr-csr"
-    },
-    "solver": {}
-}
+Running test case stencil(100,7pt,stencil)
 Matrix is of size (125, 125)
 	Running solver: cg
diff --git a/benchmark/test/reference/matrix_statistics.simple.stderr b/benchmark/test/reference/matrix_statistics.simple.stderr
index 75a7cca709f..d02edbc44da 100644
--- a/benchmark/test/reference/matrix_statistics.simple.stderr
+++ b/benchmark/test/reference/matrix_statistics.simple.stderr
@@ -1,9 +1,4 @@
 This is Ginkgo 1.7.0 (develop)
     running with core module 1.7.0 (develop)
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "problem": {}
-}
+Running test case stencil(100,7pt)
 Matrix is of size (125, 125), 725
diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr
new file mode 100644
index 00000000000..a77484daacb
--- /dev/null
+++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr
@@ -0,0 +1,254 @@
+This is Ginkgo 1.7.0 (develop)
+    running with core module 1.7.0 (develop)
+Running on reference(0)
+Running with 0 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+The operations are copy,axpy,scal
+Running test case n = 100 
+DEBUG: begin n = 100 
+	Running blas: copy
+DEBUG: begin copy
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: end   copy
+	Running blas: axpy
+DEBUG: begin axpy
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::add_scaled
+DEBUG: end   dense::add_scaled
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: end   axpy
+	Running blas: scal
+DEBUG: begin scal
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::scale
+DEBUG: end   dense::scale
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: end   scal
+DEBUG: end   n = 100 
diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stdout b/benchmark/test/reference/multi_vector_distributed.profile.stdout
new file mode 100644
index 00000000000..3a2e7e54f80
--- /dev/null
+++ b/benchmark/test/reference/multi_vector_distributed.profile.stdout
@@ -0,0 +1,29 @@
+
+[
+    {
+        "n": 100,
+        "blas": {
+            "copy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 1,
+                "completed": true
+            },
+            "axpy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 1,
+                "completed": true
+            },
+            "scal": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        }
+    }
+]
diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stderr b/benchmark/test/reference/multi_vector_distributed.simple.stderr
new file mode 100644
index 00000000000..966ed597166
--- /dev/null
+++ b/benchmark/test/reference/multi_vector_distributed.simple.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (develop)
+    running with core module 1.7.0 (develop)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The operations are copy,axpy,scal
+Running test case n = 100 
+	Running blas: copy
+	Running blas: axpy
+	Running blas: scal
diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stdout b/benchmark/test/reference/multi_vector_distributed.simple.stdout
new file mode 100644
index 00000000000..08e692727fe
--- /dev/null
+++ b/benchmark/test/reference/multi_vector_distributed.simple.stdout
@@ -0,0 +1,29 @@
+
+[
+    {
+        "n": 100,
+        "blas": {
+            "copy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "axpy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "scal": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        }
+    }
+]
diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr
index c215b22c925..def3a83993d 100644
--- a/benchmark/test/reference/preconditioner.profile.stderr
+++ b/benchmark/test/reference/preconditioner.profile.stderr
@@ -4,12 +4,7 @@ Running on reference(0)
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running with preconditioners: none
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "preconditioner": {}
-}
+Running test case stencil(100,7pt)
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin components::fill_array
diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr
index 07d2cca6704..0090e180d2b 100644
--- a/benchmark/test/reference/preconditioner.simple.stderr
+++ b/benchmark/test/reference/preconditioner.simple.stderr
@@ -4,11 +4,6 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 Running with preconditioners: none
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "preconditioner": {}
-}
+Running test case stencil(100,7pt)
 Matrix is of size (125, 125), 725
 	Running preconditioner: none
diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr
index 0c3f7060796..43ff852f68e 100644
--- a/benchmark/test/reference/solver.profile.stderr
+++ b/benchmark/test/reference/solver.profile.stderr
@@ -5,15 +5,7 @@ Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "optimal": {
-        "spmv": "csr"
-    },
-    "solver": {}
-}
+Running test case stencil(100,7pt)
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin components::fill_array
diff --git a/benchmark/test/reference/solver.simple.stderr b/benchmark/test/reference/solver.simple.stderr
index c5e4267a6bd..659dd026588 100644
--- a/benchmark/test/reference/solver.simple.stderr
+++ b/benchmark/test/reference/solver.simple.stderr
@@ -5,14 +5,6 @@ Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "optimal": {
-        "spmv": "csr"
-    },
-    "solver": {}
-}
+Running test case stencil(100,7pt)
 Matrix is of size (125, 125)
 	Running solver: cg
diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr
index d1434dad146..c47ce2a515b 100644
--- a/benchmark/test/reference/sparse_blas.profile.stderr
+++ b/benchmark/test/reference/sparse_blas.profile.stderr
@@ -4,12 +4,7 @@ Running on reference(0)
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The operations are transpose
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "sparse_blas": {}
-}
+Running test case stencil(100,7pt)
 Matrix is of size (125, 125), 725
 DEBUG: begin allocate
 DEBUG: end   allocate
diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr
index 452374a9268..1f2bb34809f 100644
--- a/benchmark/test/reference/sparse_blas.simple.stderr
+++ b/benchmark/test/reference/sparse_blas.simple.stderr
@@ -4,11 +4,6 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The operations are transpose
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "sparse_blas": {}
-}
+Running test case stencil(100,7pt)
 Matrix is of size (125, 125), 725
 	Running sparse_blas: transpose
diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr
index 09a10b725ea..4ff0125782f 100644
--- a/benchmark/test/reference/spmv.profile.stderr
+++ b/benchmark/test/reference/spmv.profile.stderr
@@ -5,12 +5,7 @@ Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The formats are coo
 The number of right hand sides is 1
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "spmv": {}
-}
+Running test case stencil(100,7pt)
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin allocate
diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr
index a910512ff31..9d5047febb6 100644
--- a/benchmark/test/reference/spmv.simple.stderr
+++ b/benchmark/test/reference/spmv.simple.stderr
@@ -5,11 +5,6 @@ Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo
 The number of right hand sides is 1
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "spmv": {}
-}
+Running test case stencil(100,7pt)
 Matrix is of size (125, 125), 725
 	Running spmv: coo
diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr
new file mode 100644
index 00000000000..95a07c8275c
--- /dev/null
+++ b/benchmark/test/reference/spmv_distributed.profile.stderr
@@ -0,0 +1,446 @@
+This is Ginkgo 1.7.0 (develop)
+    running with core module 1.7.0 (develop)
+Running on reference(0)
+Running with 0 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+The formats are [csr]x[csr]
+The number of right hand sides is 1
+Running test case stencil(100,7pt,stencil)
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill_in_matrix_data
+DEBUG: end   dense::fill_in_matrix_data
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill_in_matrix_data
+DEBUG: end   dense::fill_in_matrix_data
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+Matrix is of size (81, 81), 144
+DEBUG: begin stencil(100,7pt,stencil)
+	Running spmv: csr-csr
+DEBUG: begin csr-csr
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin copy(<typename>)
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin copy(<typename>)
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin distributed_matrix::build_local_nonlocal
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: end   distributed_matrix::build_local_nonlocal
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin copy(<typename>)
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin apply(<typename>)
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin dense::row_gather
+DEBUG: end   dense::row_gather
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: end   csr-csr
+DEBUG: end   stencil(100,7pt,stencil)
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout
new file mode 100644
index 00000000000..ebacddb887c
--- /dev/null
+++ b/benchmark/test/reference/spmv_distributed.profile.stdout
@@ -0,0 +1,22 @@
+
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "comm_pattern": "stencil",
+        "spmv": {
+            "csr-csr": {
+                "storage": 6420,
+                "time": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 81,
+        "cols": 81,
+        "nonzeros": 144,
+        "optimal": {
+            "spmv": "csr-csr"
+        }
+    }
+]
diff --git a/benchmark/test/reference/spmv_distributed.simple.stderr b/benchmark/test/reference/spmv_distributed.simple.stderr
new file mode 100644
index 00000000000..0df742d5b9b
--- /dev/null
+++ b/benchmark/test/reference/spmv_distributed.simple.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (develop)
+    running with core module 1.7.0 (develop)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The formats are [csr]x[csr]
+The number of right hand sides is 1
+Running test case stencil(100,7pt,stencil)
+Matrix is of size (81, 81), 144
+	Running spmv: csr-csr
diff --git a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout
new file mode 100644
index 00000000000..64203476f91
--- /dev/null
+++ b/benchmark/test/reference/spmv_distributed.simple.stdout
@@ -0,0 +1,23 @@
+
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "comm_pattern": "stencil",
+        "spmv": {
+            "csr-csr": {
+                "storage": 6420,
+                "max_relative_norm2": 1.0,
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        },
+        "rows": 81,
+        "cols": 81,
+        "nonzeros": 144,
+        "optimal": {
+            "spmv": "csr-csr"
+        }
+    }
+]
diff --git a/benchmark/test/spmv_distributed.py b/benchmark/test/spmv_distributed.py
new file mode 100644
index 00000000000..356db48459e
--- /dev/null
+++ b/benchmark/test/spmv_distributed.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+import test_framework
+
+# check that all input modes work:
+# parameter
+test_framework.compare_output_distributed(
+    ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]'],
+    expected_stdout="spmv_distributed.simple.stdout",
+    expected_stderr="spmv_distributed.simple.stderr",
+    num_procs=3,
+)
+
+# stdin
+test_framework.compare_output_distributed(
+    [],
+    expected_stdout="spmv_distributed.simple.stdout",
+    expected_stderr="spmv_distributed.simple.stderr",
+    num_procs=3,
+    stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]',
+)
+
+# input file
+test_framework.compare_output_distributed(
+    ["-input", str(test_framework.sourcepath / "input.distributed_mtx.json")],
+    expected_stdout="spmv_distributed.simple.stdout",
+    expected_stderr="spmv_distributed.simple.stderr",
+    num_procs=3,
+)
+
+# profiler annotations
+test_framework.compare_output_distributed(
+    [
+        "-input",
+        '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]',
+        "-profile",
+        "-profiler_hook",
+        "debug",
+    ],
+    expected_stdout="spmv_distributed.profile.stdout",
+    expected_stderr="spmv_distributed.profile.stderr",
+    num_procs=3,
+)
diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index da1b0bfd618..faf898a21cb 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -247,7 +247,7 @@ def compare_output(
 def compare_output_distributed(
     args, expected_stdout, expected_stderr, num_procs, stdin=""
 ):
-    compare_output(
+    compare_output_impl(
         args,
         expected_stdout,
         expected_stderr,
diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 41acb560ba1..1c48680f883 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -245,45 +245,6 @@ std::shared_ptr<gko::log::ProfilerHook> create_profiler_hook(
 }
 
 
-struct owning_profiling_scope_guard {
-    std::string name;
-    gko::log::profiling_scope_guard guard;
-
-    owning_profiling_scope_guard() = default;
-
-    owning_profiling_scope_guard(std::string name_,
-                                 gko::log::ProfilerHook* profiler_hook)
-        : name(std::move(name_)), guard{profiler_hook->user_range(name.c_str())}
-    {}
-};
-
-
-struct annotate_functor {
-    owning_profiling_scope_guard operator()(std::string name) const
-    {
-        if (profiler_hook) {
-            return owning_profiling_scope_guard{std::move(name),
-                                                profiler_hook.get()};
-        }
-        return {};
-    }
-
-    gko::log::profiling_scope_guard operator()(const char* name) const
-    {
-        if (profiler_hook) {
-            return profiler_hook->user_range(name);
-        }
-        return {};
-    }
-
-    annotate_functor(std::shared_ptr<gko::log::ProfilerHook> profiler_hook)
-        : profiler_hook{std::move(profiler_hook)}
-    {}
-
-    std::shared_ptr<gko::log::ProfilerHook> profiler_hook;
-};
-
-
 // Returns a random number engine
 std::default_random_engine& get_engine()
 {
diff --git a/benchmark/utils/runner.hpp b/benchmark/utils/runner.hpp
index 3520f7299ee..661c403706f 100644
--- a/benchmark/utils/runner.hpp
+++ b/benchmark/utils/runner.hpp
@@ -153,13 +153,13 @@ void run_test_cases(const Benchmark<State>& benchmark,
             if (!test_case.contains(benchmark.get_name())) {
                 test_case[benchmark.get_name()] = json::object();
             }
+            auto test_case_desc = benchmark.describe_config(test_case);
             if (benchmark.should_print()) {
-                std::clog << "Running test case\n"
-                          << std::setw(4) << test_case << std::endl;
+                std::clog << "Running test case " << test_case_desc
+                          << std::endl;
             }
             auto test_case_state = benchmark.setup(exec, test_case);
-            auto test_case_str = benchmark.describe_config(test_case);
-            auto test_case_range = annotate(test_case_str.c_str());
+            auto test_case_range = annotate(test_case_desc.c_str());
             auto& benchmark_case = test_case[benchmark.get_name()];
             for (const auto& operation_name : benchmark.get_operations()) {
                 if (benchmark_case.contains(operation_name) &&
@@ -183,7 +183,7 @@ void run_test_cases(const Benchmark<State>& benchmark,
                         gko::name_demangling::get_dynamic_type(e);
                     operation_case["error"] = e.what();
                     std::cerr << "Error when processing test case\n"
-                              << std::setw(4) << test_case << "\n"
+                              << test_case_desc << "\n"
                               << "what(): " << e.what() << std::endl;
                 }
 

From ca3fccf3e9a03d63e3e2032556e8edd1543cea67 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 27 Jul 2023 23:44:32 +0200
Subject: [PATCH 03/13] handle JSON and non-JSON test output separately

---
 benchmark/test/reference/blas.profile.stdout  |  3 +-
 benchmark/test/reference/blas.simple.stdout   |  3 +-
 .../test/reference/conversion.all.stdout      |  3 +-
 .../test/reference/conversion.profile.stdout  |  3 +-
 .../test/reference/conversion.simple.stdout   |  3 +-
 .../distributed_solver.profile.stdout         |  3 +-
 .../distributed_solver.simple.stdout          |  3 +-
 .../reference/matrix_statistics.simple.stdout |  3 +-
 .../multi_vector_distributed.profile.stdout   |  3 +-
 .../multi_vector_distributed.simple.stdout    |  3 +-
 .../reference/preconditioner.profile.stdout   |  3 +-
 .../reference/preconditioner.simple.stdout    |  3 +-
 .../test/reference/solver.profile.stdout      |  3 +-
 benchmark/test/reference/solver.simple.stdout |  3 +-
 .../test/reference/sparse_blas.profile.stdout |  3 +-
 .../test/reference/sparse_blas.simple.stdout  |  3 +-
 benchmark/test/reference/spmv.profile.stdout  |  3 +-
 benchmark/test/reference/spmv.simple.stdout   |  3 +-
 .../reference/spmv_distributed.profile.stdout |  3 +-
 .../reference/spmv_distributed.simple.stdout  |  3 +-
 benchmark/test/test_framework.py.in           | 78 ++++++-------------
 21 files changed, 44 insertions(+), 94 deletions(-)

diff --git a/benchmark/test/reference/blas.profile.stdout b/benchmark/test/reference/blas.profile.stdout
index 3a2e7e54f80..8998d5eaed7 100644
--- a/benchmark/test/reference/blas.profile.stdout
+++ b/benchmark/test/reference/blas.profile.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "n": 100,
@@ -26,4 +25,4 @@
             }
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/blas.simple.stdout b/benchmark/test/reference/blas.simple.stdout
index 08e692727fe..a586a9bc57b 100644
--- a/benchmark/test/reference/blas.simple.stdout
+++ b/benchmark/test/reference/blas.simple.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "n": 100,
@@ -26,4 +25,4 @@
             }
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/conversion.all.stdout b/benchmark/test/reference/conversion.all.stdout
index c4b657a42c4..0c77d464793 100644
--- a/benchmark/test/reference/conversion.all.stdout
+++ b/benchmark/test/reference/conversion.all.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -74,4 +73,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/conversion.profile.stdout b/benchmark/test/reference/conversion.profile.stdout
index b29815f6c17..a9c3ea674fa 100644
--- a/benchmark/test/reference/conversion.profile.stdout
+++ b/benchmark/test/reference/conversion.profile.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -29,4 +28,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/conversion.simple.stdout b/benchmark/test/reference/conversion.simple.stdout
index 856f1330eea..81c735789d1 100644
--- a/benchmark/test/reference/conversion.simple.stdout
+++ b/benchmark/test/reference/conversion.simple.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -29,4 +28,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/distributed_solver.profile.stdout b/benchmark/test/reference/distributed_solver.profile.stdout
index aef92652256..55dfb1dc428 100644
--- a/benchmark/test/reference/distributed_solver.profile.stdout
+++ b/benchmark/test/reference/distributed_solver.profile.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -31,4 +30,4 @@
         "rows": 125,
         "cols": 125
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/distributed_solver.simple.stdout b/benchmark/test/reference/distributed_solver.simple.stdout
index 002b9d91347..eed8d864388 100644
--- a/benchmark/test/reference/distributed_solver.simple.stdout
+++ b/benchmark/test/reference/distributed_solver.simple.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -57,4 +56,4 @@
         "rows": 125,
         "cols": 125
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/matrix_statistics.simple.stdout b/benchmark/test/reference/matrix_statistics.simple.stdout
index 13746ce8a46..923bbc9f962 100644
--- a/benchmark/test/reference/matrix_statistics.simple.stdout
+++ b/benchmark/test/reference/matrix_statistics.simple.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -38,4 +37,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stdout b/benchmark/test/reference/multi_vector_distributed.profile.stdout
index 3a2e7e54f80..8998d5eaed7 100644
--- a/benchmark/test/reference/multi_vector_distributed.profile.stdout
+++ b/benchmark/test/reference/multi_vector_distributed.profile.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "n": 100,
@@ -26,4 +25,4 @@
             }
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stdout b/benchmark/test/reference/multi_vector_distributed.simple.stdout
index 08e692727fe..a586a9bc57b 100644
--- a/benchmark/test/reference/multi_vector_distributed.simple.stdout
+++ b/benchmark/test/reference/multi_vector_distributed.simple.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "n": 100,
@@ -26,4 +25,4 @@
             }
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/preconditioner.profile.stdout b/benchmark/test/reference/preconditioner.profile.stdout
index f53407d818d..e33a6502eea 100644
--- a/benchmark/test/reference/preconditioner.profile.stdout
+++ b/benchmark/test/reference/preconditioner.profile.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -22,4 +21,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/preconditioner.simple.stdout b/benchmark/test/reference/preconditioner.simple.stdout
index 92bb51ddb57..06291228a1c 100644
--- a/benchmark/test/reference/preconditioner.simple.stdout
+++ b/benchmark/test/reference/preconditioner.simple.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -30,4 +29,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/solver.profile.stdout b/benchmark/test/reference/solver.profile.stdout
index 0148e6ef092..906c74de5e7 100644
--- a/benchmark/test/reference/solver.profile.stdout
+++ b/benchmark/test/reference/solver.profile.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -30,4 +29,4 @@
         "rows": 125,
         "cols": 125
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/solver.simple.stdout b/benchmark/test/reference/solver.simple.stdout
index b4e7b56b2bf..5d127fe4b78 100644
--- a/benchmark/test/reference/solver.simple.stdout
+++ b/benchmark/test/reference/solver.simple.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -54,4 +53,4 @@
         "rows": 125,
         "cols": 125
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/sparse_blas.profile.stdout b/benchmark/test/reference/sparse_blas.profile.stdout
index 848fb503ed4..e9d48fde23d 100644
--- a/benchmark/test/reference/sparse_blas.profile.stdout
+++ b/benchmark/test/reference/sparse_blas.profile.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -16,4 +15,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/sparse_blas.simple.stdout b/benchmark/test/reference/sparse_blas.simple.stdout
index f39300ca35b..3cc5f774ebf 100644
--- a/benchmark/test/reference/sparse_blas.simple.stdout
+++ b/benchmark/test/reference/sparse_blas.simple.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -23,4 +22,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/spmv.profile.stdout b/benchmark/test/reference/spmv.profile.stdout
index 5302d54f9f0..409a92d4e33 100644
--- a/benchmark/test/reference/spmv.profile.stdout
+++ b/benchmark/test/reference/spmv.profile.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -18,4 +17,4 @@
             "spmv": "coo"
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/spmv.simple.stdout b/benchmark/test/reference/spmv.simple.stdout
index 737938d7c96..9601a15b331 100644
--- a/benchmark/test/reference/spmv.simple.stdout
+++ b/benchmark/test/reference/spmv.simple.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -19,4 +18,4 @@
             "spmv": "coo"
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout
index ebacddb887c..8de6a68ae8a 100644
--- a/benchmark/test/reference/spmv_distributed.profile.stdout
+++ b/benchmark/test/reference/spmv_distributed.profile.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -19,4 +18,4 @@
             "spmv": "csr-csr"
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout
index 64203476f91..f94e4b992a1 100644
--- a/benchmark/test/reference/spmv_distributed.simple.stdout
+++ b/benchmark/test/reference/spmv_distributed.simple.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "size": 100,
@@ -20,4 +19,4 @@
             "spmv": "csr-csr"
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index faf898a21cb..3deb282297a 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -22,7 +22,8 @@ denumberify_paths = [
     "rhs_norm",
     "max_relative_norm2",
 ]
-empty_string_paths = ["error", "filename"]
+detypenameify_key_starts = ["generate(", "apply(", "advanced_apply(", "copy(", "check("]
+empty_string_paths = ["filename"]
 empty_array_paths = [
     "recurrent_residuals",
     "true_residuals",
@@ -31,6 +32,18 @@ empty_array_paths = [
 ]
 
 
+def sanitize_json_key(key: str):
+    """Applies sanitation to a single key.
+
+    Keys starting with an entry of detypenameify_key_starts become that entry + "<typename>)".
+    """
+
+    for start in detypenameify_key_starts:
+        if key.startswith(start):
+            return start + "<typename>)"
+    return key
+
+
 def sanitize_json_key_value(key: str, value, sanitize_all: bool):
     """Applies sanitation to a single key-value pair.
 
@@ -59,7 +72,7 @@ def sanitize_json(parsed_input, sanitize_all: bool = False):
 
     if isinstance(parsed_input, dict):
         return {
-            key: sanitize_json_key_value(key, value, sanitize_all)
+            sanitize_json_key(key): sanitize_json_key_value(key, value, sanitize_all)
             for key, value in parsed_input.items()
         }
     elif isinstance(parsed_input, list):
@@ -70,40 +83,15 @@ def sanitize_json(parsed_input, sanitize_all: bool = False):
         return parsed_input
 
 
-def sanitize_json_in_text(lines: List[str]) -> List[str]:
-    """Sanitizes all occurrences of JSON content inside text input.
+def determinize_json_text(input: str) -> List[str]:
+    """Sanitizes the given input JSON string.
 
-    Takes a list of text lines and detects any pretty-printed JSON output inside
-    (recognized by a single [, {, } or ] in an otherwise empty line).
-    The JSON output will be parsed and sanitized through sanitize_json(...)
+    The JSON values will be parsed and sanitized through sanitize_json(...)
     and pretty-printed to replace the original JSON input.
-    The function returns the resulting output.
     """
 
-    json_begins = [i for i, l in enumerate(lines) if l in ["[", "{"]]
-    json_ends = [i + 1 for i, l in enumerate(lines) if l in ["]", "}"]]
-    json_pairs = list(zip(json_begins, json_ends))
-    if len(json_pairs) == 0:
-        return lines
-    assert all(begin < end for begin, end in json_pairs)
-    nonjson_pairs = (
-        [(0, json_begins[0])]
-        + list(zip(json_ends[:-1], json_begins[1:]))
-        + [(json_ends[-1], len(lines))]
-    )
-    combined_pairs = sorted(
-        [(begin, end, False) for begin, end in nonjson_pairs]
-        + [(begin, end, True) for begin, end in json_pairs]
-    )
-    texts = [
-        ("\n".join(lines[begin:end]), do_sanitize)
-        for begin, end, do_sanitize in combined_pairs
-    ]
-    reconstructed = [
-        json.dumps(sanitize_json(json.loads(t)), indent=4) if do_sanitize else t
-        for t, do_sanitize in texts
-    ]
-    return "\n".join(reconstructed).split("\n")
+    result = json.dumps(sanitize_json(json.loads(input)), indent=4)
+    return result.splitlines()
 
 
 def determinize_text(
@@ -116,9 +104,6 @@ def determinize_text(
     Every input line matching an entry from ignore_patterns will be removed.
     Every line matching the first string in an entry from replace_patterns
     will be replaced by the second string.
-    Finally, the text will be passed to sanitize_json_in_text, which removes
-    nondeterministic parts from JSON objects/arrays in the input,
-    if it can be parsed correctly.
     The output is guaranteed to end with an empty line.
     """
 
@@ -137,10 +122,7 @@ def determinize_text(
             output_lines.append(line)
     if len(output_lines) == 0 or output_lines[-1] != "":
         output_lines.append("")
-    try:
-        return sanitize_json_in_text(output_lines)
-    except json.decoder.JSONDecodeError:
-        return output_lines
+    return output_lines
 
 
 def compare_output_impl(
@@ -173,13 +155,7 @@ def compare_output_impl(
     ]
     if generate:
         open(expected_stdout, "w").write(
-            "\n".join(
-                determinize_text(
-                    result.stdout.decode(),
-                    ignore_patterns=[],
-                    replace_patterns=typename_patterns,
-                )
-            )
+            "\n".join(determinize_json_text(result.stdout.decode()))
         )
         open(expected_stderr, "w").write(
             "\n".join(
@@ -192,19 +168,13 @@ def compare_output_impl(
         )
         print("GENERATED")
         return
-    result_stdout_processed = determinize_text(
-        result.stdout.decode(), ignore_patterns=[], replace_patterns=typename_patterns
-    )
+    result_stdout_processed = determinize_json_text(result.stdout.decode())
     result_stderr_processed = determinize_text(
         result.stderr.decode(),
         ignore_patterns=version_patterns,
         replace_patterns=typename_patterns,
     )
-    expected_stdout_processed = determinize_text(
-        open(expected_stdout).read(),
-        ignore_patterns=[],
-        replace_patterns=typename_patterns,
-    )
+    expected_stdout_processed = determinize_json_text(open(expected_stdout).read())
     expected_stderr_processed = determinize_text(
         open(expected_stderr).read(),
         ignore_patterns=version_patterns,

From d27b507474d04567a36adbdf9a103d9bdabaf5c9 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 28 Jul 2023 00:15:00 +0200
Subject: [PATCH 04/13] benchmark reads on device_matrix_data

---
 benchmark/conversion/conversion.cpp           | 11 +++--
 .../test/reference/conversion.profile.stderr  | 46 +++++++++++++++----
 2 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp
index b9a5d5c46d6..5f03cb2b933 100644
--- a/benchmark/conversion/conversion.cpp
+++ b/benchmark/conversion/conversion.cpp
@@ -60,7 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 using Generator = DefaultSystemGenerator<>;
 
 
-struct ConversionBenchmark : Benchmark<gko::matrix_data<etype, itype>> {
+struct ConversionBenchmark : Benchmark<gko::device_matrix_data<etype, itype>> {
     std::string name;
     std::vector<std::string> operations;
 
@@ -112,8 +112,8 @@ struct ConversionBenchmark : Benchmark<gko::matrix_data<etype, itype>> {
         return Generator::describe_config(test_case);
     }
 
-    gko::matrix_data<etype, itype> setup(std::shared_ptr<gko::Executor> exec,
-                                         json& test_case) const override
+    gko::device_matrix_data<etype, itype> setup(
+        std::shared_ptr<gko::Executor> exec, json& test_case) const override
     {
         gko::matrix_data<etype, itype> data;
         data = Generator::generate_matrix_data(test_case);
@@ -122,12 +122,13 @@ struct ConversionBenchmark : Benchmark<gko::matrix_data<etype, itype>> {
         test_case["rows"] = data.size[0];
         test_case["cols"] = data.size[1];
         test_case["nonzeros"] = data.nonzeros.size();
-        return data;
+        return gko::device_matrix_data<etype, itype>::create_from_host(exec,
+                                                                       data);
     }
 
 
     void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
-             gko::matrix_data<etype, itype>& data,
+             gko::device_matrix_data<etype, itype>& data,
              const std::string& operation_name,
              json& operation_case) const override
     {
diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr
index 6078dd3db2f..ca80375c5bf 100644
--- a/benchmark/test/reference/conversion.profile.stderr
+++ b/benchmark/test/reference/conversion.profile.stderr
@@ -6,17 +6,29 @@ The random seed for right hand sides is 42
 The formats are coo,csr
 Running test case stencil(100,7pt)
 Matrix is of size (125, 125), 725
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin allocate
+DEBUG: end   allocate
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
 DEBUG: begin stencil(100,7pt)
 	Running conversion: coo-read
 DEBUG: begin coo-read
 DEBUG: begin allocate
 DEBUG: end   allocate
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
-DEBUG: begin components::aos_to_soa
-DEBUG: end   components::aos_to_soa
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin free
 DEBUG: end   free
 DEBUG: begin free
@@ -28,12 +40,16 @@ DEBUG: end   coo-read
 DEBUG: begin coo-csr
 DEBUG: begin allocate
 DEBUG: end   allocate
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
-DEBUG: begin components::aos_to_soa
-DEBUG: end   components::aos_to_soa
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin components::fill_array
@@ -75,12 +91,16 @@ DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin allocate
 DEBUG: end   allocate
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
-DEBUG: begin components::aos_to_soa
-DEBUG: end   components::aos_to_soa
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin free
@@ -104,12 +124,16 @@ DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin allocate
 DEBUG: end   allocate
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
-DEBUG: begin components::aos_to_soa
-DEBUG: end   components::aos_to_soa
+DEBUG: begin copy
+DEBUG: end   copy
 DEBUG: begin allocate
 DEBUG: end   allocate
 DEBUG: begin free
@@ -146,3 +170,9 @@ DEBUG: begin free
 DEBUG: end   free
 DEBUG: end   csr-coo
 DEBUG: end   stencil(100,7pt)
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free
+DEBUG: begin free
+DEBUG: end   free

From e3af0296829eedb9072b8c2d75bc129123837ece Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 17 Aug 2023 21:17:05 +0200
Subject: [PATCH 05/13] remove allocations from output

allocate/free events are sometimes implementation-dependent
for libstdc++ types
---
 benchmark/test/reference/blas.profile.stderr  |  28 --
 .../test/reference/conversion.profile.stderr  | 104 ------
 .../distributed_solver.profile.stderr         | 232 -------------
 .../multi_vector_distributed.profile.stderr   | 128 --------
 .../reference/preconditioner.profile.stderr   |  44 ---
 .../test/reference/solver.profile.stderr      | 132 --------
 .../test/reference/sparse_blas.profile.stderr |  36 --
 benchmark/test/reference/spmv.profile.stderr  |  48 ---
 .../reference/spmv_distributed.profile.stderr | 308 ------------------
 benchmark/test/test_framework.py.in           |  11 +-
 10 files changed, 6 insertions(+), 1065 deletions(-)

diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr
index 1313c85e462..529fc16009c 100644
--- a/benchmark/test/reference/blas.profile.stderr
+++ b/benchmark/test/reference/blas.profile.stderr
@@ -8,27 +8,13 @@ Running test case n = 100
 DEBUG: begin n = 100 
 	Running blas: copy
 DEBUG: begin copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   copy
 	Running blas: axpy
 DEBUG: begin axpy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill
@@ -37,28 +23,14 @@ DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::add_scaled
 DEBUG: end   dense::add_scaled
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   axpy
 	Running blas: scal
 DEBUG: begin scal
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::scale
 DEBUG: end   dense::scale
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   scal
 DEBUG: end   n = 100 
diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr
index ca80375c5bf..a233579c721 100644
--- a/benchmark/test/reference/conversion.profile.stderr
+++ b/benchmark/test/reference/conversion.profile.stderr
@@ -6,173 +6,69 @@ The random seed for right hand sides is 42
 The formats are coo,csr
 Running test case stencil(100,7pt)
 Matrix is of size (125, 125), 725
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
 DEBUG: begin stencil(100,7pt)
 	Running conversion: coo-read
 DEBUG: begin coo-read
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   coo-read
 	Running conversion: coo-csr
 DEBUG: begin coo-csr
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
 DEBUG: end   copy(<typename>)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   coo-csr
 	Running conversion: csr-read
 DEBUG: begin csr-read
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   csr-read
 	Running conversion: csr-coo
 DEBUG: begin csr-coo
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::convert_ptrs_to_idxs
 DEBUG: end   components::convert_ptrs_to_idxs
 DEBUG: end   copy(<typename>)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   csr-coo
 DEBUG: end   stencil(100,7pt)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr
index 1daab773a38..4ea20730117 100644
--- a/benchmark/test/reference/distributed_solver.profile.stderr
+++ b/benchmark/test/reference/distributed_solver.profile.stderr
@@ -6,18 +6,8 @@ The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
 Running test case stencil(100,7pt,stencil)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin partition::build_ranges_from_global_size
 DEBUG: end   partition::build_ranges_from_global_size
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin components::fill_array
@@ -32,18 +22,10 @@ DEBUG: begin partition::build_starting_indices
 DEBUG: end   partition::build_starting_indices
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin copy(<typename>)
@@ -56,8 +38,6 @@ DEBUG: end   copy
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: end   copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin copy(<typename>)
@@ -70,135 +50,29 @@ DEBUG: end   copy
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: end   copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin distributed_matrix::build_local_nonlocal
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   distributed_matrix::build_local_nonlocal
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
 DEBUG: end   dense::fill_in_matrix_data
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
@@ -206,62 +80,30 @@ Matrix is of size (125, 125)
 DEBUG: begin stencil(100,7pt,stencil)
 	Running solver: cg
 DEBUG: begin cg
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::compute_squared_norm2
 DEBUG: end   dense::compute_squared_norm2
 DEBUG: begin dense::compute_sqrt
 DEBUG: end   dense::compute_sqrt
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin generate(<typename>)
 DEBUG: begin generate(<typename>)
 DEBUG: end   generate(<typename>)
 DEBUG: end   generate(<typename>)
 DEBUG: begin apply(<typename>)
 DEBUG: begin iteration
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin cg::initialize
 DEBUG: end   cg::initialize
 DEBUG: begin advanced_apply(<typename>)
@@ -276,20 +118,10 @@ DEBUG: begin csr::advanced_spmv
 DEBUG: end   csr::advanced_spmv
 DEBUG: end   advanced_apply(<typename>)
 DEBUG: end   advanced_apply(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::compute_squared_norm2
 DEBUG: end   dense::compute_squared_norm2
 DEBUG: begin dense::compute_sqrt
 DEBUG: end   dense::compute_sqrt
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin apply(<typename>)
 DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
@@ -586,25 +418,9 @@ DEBUG: begin residual_norm::residual_norm
 DEBUG: end   residual_norm::residual_norm
 DEBUG: end   check(<typename>)
 DEBUG: end   check(<typename>)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   iteration
 DEBUG: end   apply(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
@@ -620,59 +436,11 @@ DEBUG: begin csr::advanced_spmv
 DEBUG: end   csr::advanced_spmv
 DEBUG: end   advanced_apply(<typename>)
 DEBUG: end   advanced_apply(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::compute_squared_norm2
 DEBUG: end   dense::compute_squared_norm2
 DEBUG: begin dense::compute_sqrt
 DEBUG: end   dense::compute_sqrt
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   cg
 DEBUG: end   stencil(100,7pt,stencil)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr
index a77484daacb..102330e38f4 100644
--- a/benchmark/test/reference/multi_vector_distributed.profile.stderr
+++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr
@@ -8,18 +8,8 @@ Running test case n = 100
 DEBUG: begin n = 100 
 	Running blas: copy
 DEBUG: begin copy
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin partition::build_ranges_from_global_size
 DEBUG: end   partition::build_ranges_from_global_size
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin components::fill_array
@@ -34,32 +24,10 @@ DEBUG: begin partition::build_starting_indices
 DEBUG: end   partition::build_starting_indices
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin partition::build_ranges_from_global_size
 DEBUG: end   partition::build_ranges_from_global_size
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin components::fill_array
@@ -74,45 +42,17 @@ DEBUG: begin partition::build_starting_indices
 DEBUG: end   partition::build_starting_indices
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   copy
 	Running blas: axpy
 DEBUG: begin axpy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin partition::build_ranges_from_global_size
 DEBUG: end   partition::build_ranges_from_global_size
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin components::fill_array
@@ -127,32 +67,10 @@ DEBUG: begin partition::build_starting_indices
 DEBUG: end   partition::build_starting_indices
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin partition::build_ranges_from_global_size
 DEBUG: end   partition::build_ranges_from_global_size
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin components::fill_array
@@ -167,20 +85,8 @@ DEBUG: begin partition::build_starting_indices
 DEBUG: end   partition::build_starting_indices
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill
@@ -189,29 +95,11 @@ DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::add_scaled
 DEBUG: end   dense::add_scaled
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   axpy
 	Running blas: scal
 DEBUG: begin scal
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin partition::build_ranges_from_global_size
 DEBUG: end   partition::build_ranges_from_global_size
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin components::fill_array
@@ -226,29 +114,13 @@ DEBUG: begin partition::build_starting_indices
 DEBUG: end   partition::build_starting_indices
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::scale
 DEBUG: end   dense::scale
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   scal
 DEBUG: end   n = 100 
diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr
index def3a83993d..610dfe464ec 100644
--- a/benchmark/test/reference/preconditioner.profile.stderr
+++ b/benchmark/test/reference/preconditioner.profile.stderr
@@ -5,50 +5,20 @@ Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running with preconditioners: none
 Running test case stencil(100,7pt)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
 DEBUG: end   dense::fill_in_matrix_data
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
@@ -58,8 +28,6 @@ DEBUG: begin stencil(100,7pt)
 	Running preconditioner: none
 DEBUG: begin none
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
@@ -71,17 +39,5 @@ DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
 DEBUG: end   apply(<typename>)
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   none
 DEBUG: end   stencil(100,7pt)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr
index 43ff852f68e..238591eb0c9 100644
--- a/benchmark/test/reference/solver.profile.stderr
+++ b/benchmark/test/reference/solver.profile.stderr
@@ -6,49 +6,19 @@ The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
 Running test case stencil(100,7pt)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
 DEBUG: end   dense::fill_in_matrix_data
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
@@ -56,78 +26,36 @@ Matrix is of size (125, 125)
 DEBUG: begin stencil(100,7pt)
 	Running solver: cg
 DEBUG: begin cg
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::compute_norm2_dispatch
 DEBUG: end   dense::compute_norm2_dispatch
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin generate(<typename>)
 DEBUG: begin generate(<typename>)
 DEBUG: end   generate(<typename>)
 DEBUG: end   generate(<typename>)
 DEBUG: begin apply(<typename>)
 DEBUG: begin iteration
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin cg::initialize
 DEBUG: end   cg::initialize
 DEBUG: begin advanced_apply(<typename>)
 DEBUG: begin csr::advanced_spmv
 DEBUG: end   csr::advanced_spmv
 DEBUG: end   advanced_apply(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::compute_norm2_dispatch
 DEBUG: end   dense::compute_norm2_dispatch
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin apply(<typename>)
 DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
@@ -352,25 +280,9 @@ DEBUG: begin residual_norm::residual_norm
 DEBUG: end   residual_norm::residual_norm
 DEBUG: end   check(<typename>)
 DEBUG: end   check(<typename>)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   iteration
 DEBUG: end   apply(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
@@ -378,53 +290,9 @@ DEBUG: begin advanced_apply(<typename>)
 DEBUG: begin csr::advanced_spmv
 DEBUG: end   csr::advanced_spmv
 DEBUG: end   advanced_apply(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::compute_norm2_dispatch
 DEBUG: end   dense::compute_norm2_dispatch
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   cg
 DEBUG: end   stencil(100,7pt)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr
index c47ce2a515b..60cf41ccbae 100644
--- a/benchmark/test/reference/sparse_blas.profile.stderr
+++ b/benchmark/test/reference/sparse_blas.profile.stderr
@@ -6,54 +6,18 @@ The random seed for right hand sides is 42
 The operations are transpose
 Running test case stencil(100,7pt)
 Matrix is of size (125, 125), 725
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin stencil(100,7pt)
 	Running sparse_blas: transpose
 DEBUG: begin transpose
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin csr::transpose
 DEBUG: end   csr::transpose
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   transpose
 DEBUG: end   stencil(100,7pt)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr
index 4ff0125782f..2299614c6c4 100644
--- a/benchmark/test/reference/spmv.profile.stderr
+++ b/benchmark/test/reference/spmv.profile.stderr
@@ -6,61 +6,25 @@ The random seed for right hand sides is 42
 The formats are coo
 The number of right hand sides is 1
 Running test case stencil(100,7pt)
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
 DEBUG: end   dense::fill_in_matrix_data
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
 DEBUG: end   dense::fill_in_matrix_data
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 Matrix is of size (125, 125), 725
 DEBUG: begin stencil(100,7pt)
 	Running spmv: coo
 DEBUG: begin coo
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
@@ -68,17 +32,5 @@ DEBUG: begin apply(<typename>)
 DEBUG: begin coo::spmv
 DEBUG: end   coo::spmv
 DEBUG: end   apply(<typename>)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   coo
 DEBUG: end   stencil(100,7pt)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr
index 95a07c8275c..b44cef7f3f6 100644
--- a/benchmark/test/reference/spmv_distributed.profile.stderr
+++ b/benchmark/test/reference/spmv_distributed.profile.stderr
@@ -6,18 +6,8 @@ The random seed for right hand sides is 42
 The formats are [csr]x[csr]
 The number of right hand sides is 1
 Running test case stencil(100,7pt,stencil)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin partition::build_ranges_from_global_size
 DEBUG: end   partition::build_ranges_from_global_size
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin components::fill_array
@@ -32,50 +22,16 @@ DEBUG: begin partition::build_starting_indices
 DEBUG: end   partition::build_starting_indices
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
 DEBUG: end   dense::fill_in_matrix_data
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin partition::build_ranges_from_global_size
 DEBUG: end   partition::build_ranges_from_global_size
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin components::fill_array
@@ -90,54 +46,20 @@ DEBUG: begin partition::build_starting_indices
 DEBUG: end   partition::build_starting_indices
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
 DEBUG: end   dense::fill_in_matrix_data
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 Matrix is of size (81, 81), 144
 DEBUG: begin stencil(100,7pt,stencil)
 	Running spmv: csr-csr
 DEBUG: begin csr-csr
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin partition::build_ranges_from_global_size
 DEBUG: end   partition::build_ranges_from_global_size
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin components::fill_array
@@ -152,18 +74,10 @@ DEBUG: begin partition::build_starting_indices
 DEBUG: end   partition::build_starting_indices
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin copy(<typename>)
@@ -176,8 +90,6 @@ DEBUG: end   copy
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: end   copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin copy(<typename>)
@@ -190,219 +102,27 @@ DEBUG: end   copy
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: end   copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin distributed_matrix::build_local_nonlocal
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   distributed_matrix::build_local_nonlocal
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy
 DEBUG: end   copy
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: begin copy(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
 DEBUG: begin apply(<typename>)
-DEBUG: begin allocate
-DEBUG: end   allocate
-DEBUG: begin allocate
-DEBUG: end   allocate
 DEBUG: begin dense::row_gather
 DEBUG: end   dense::row_gather
 DEBUG: begin apply(<typename>)
@@ -414,33 +134,5 @@ DEBUG: begin csr::advanced_spmv
 DEBUG: end   csr::advanced_spmv
 DEBUG: end   advanced_apply(<typename>)
 DEBUG: end   apply(<typename>)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
 DEBUG: end   csr-csr
 DEBUG: end   stencil(100,7pt,stencil)
-DEBUG: begin free
-DEBUG: end   free
-DEBUG: begin free
-DEBUG: end   free
diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index 3deb282297a..014d3cb41a5 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -146,8 +146,9 @@ def compare_output_impl(
             " ".join(["'{}'".format(arg) for arg in launcher_flags + args])
         )
     )
-    version_patterns = [
-        "    the .* module is",
+    ignore_patterns = [
+        "    the .* module is",  # version numbers
+        "DEBUG: (begin|end  ) (allocate|free)",  # allocations
     ]
     typename_patterns = [
         ("(apply|generate|check|copy|move)\([^())]*\)", "\\1(<typename>)"),
@@ -161,7 +162,7 @@ def compare_output_impl(
             "\n".join(
                 determinize_text(
                     result.stderr.decode(),
-                    ignore_patterns=version_patterns,
+                    ignore_patterns=ignore_patterns,
                     replace_patterns=typename_patterns,
                 )
             )
@@ -171,13 +172,13 @@ def compare_output_impl(
     result_stdout_processed = determinize_json_text(result.stdout.decode())
     result_stderr_processed = determinize_text(
         result.stderr.decode(),
-        ignore_patterns=version_patterns,
+        ignore_patterns=ignore_patterns,
         replace_patterns=typename_patterns,
     )
     expected_stdout_processed = determinize_json_text(open(expected_stdout).read())
     expected_stderr_processed = determinize_text(
         open(expected_stderr).read(),
-        ignore_patterns=version_patterns,
+        ignore_patterns=ignore_patterns,
         replace_patterns=typename_patterns,
     )
     failed = False

From 9cd278ce9aee94e4a70db750390b68aa6b3d93a2 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 17 Aug 2023 21:32:01 +0200
Subject: [PATCH 06/13] update matrix outputs

---
 benchmark/test/reference/conversion.matrix.stderr        | 6 +-----
 benchmark/test/reference/conversion.matrix.stdout        | 3 +--
 .../test/reference/distributed_solver.matrix.stderr      | 9 +--------
 .../test/reference/distributed_solver.matrix.stdout      | 3 +--
 benchmark/test/reference/matrix_statistics.matrix.stderr | 6 +-----
 benchmark/test/reference/matrix_statistics.matrix.stdout | 3 +--
 benchmark/test/reference/preconditioner.matrix.stderr    | 6 +-----
 benchmark/test/reference/preconditioner.matrix.stdout    | 3 +--
 benchmark/test/reference/solver.matrix.stderr            | 9 +--------
 benchmark/test/reference/solver.matrix.stdout            | 3 +--
 benchmark/test/reference/sparse_blas.matrix.stderr       | 6 +-----
 benchmark/test/reference/sparse_blas.matrix.stdout       | 3 +--
 benchmark/test/reference/spmv.matrix.stderr              | 6 +-----
 benchmark/test/reference/spmv.matrix.stdout              | 3 +--
 benchmark/test/test_framework.py.in                      | 1 +
 15 files changed, 15 insertions(+), 55 deletions(-)

diff --git a/benchmark/test/reference/conversion.matrix.stderr b/benchmark/test/reference/conversion.matrix.stderr
index 369a363a53e..5e7bd1cce24 100644
--- a/benchmark/test/reference/conversion.matrix.stderr
+++ b/benchmark/test/reference/conversion.matrix.stderr
@@ -4,11 +4,7 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr
-Running test case
-{
-    "filename": "",
-    "conversion": {}
-}
+Running test case <filename>
 Matrix is of size (36, 36), 208
 	Running conversion: coo-read
 	Running conversion: coo-csr
diff --git a/benchmark/test/reference/conversion.matrix.stdout b/benchmark/test/reference/conversion.matrix.stdout
index 7e537fa4919..7f27b0c25b3 100644
--- a/benchmark/test/reference/conversion.matrix.stdout
+++ b/benchmark/test/reference/conversion.matrix.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "filename": "",
@@ -28,4 +27,4 @@
         "cols": 36,
         "nonzeros": 208
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/distributed_solver.matrix.stderr b/benchmark/test/reference/distributed_solver.matrix.stderr
index 4f0c6b22edd..cd2bb49261c 100644
--- a/benchmark/test/reference/distributed_solver.matrix.stderr
+++ b/benchmark/test/reference/distributed_solver.matrix.stderr
@@ -5,13 +5,6 @@ Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-Running test case
-{
-    "filename": "",
-    "optimal": {
-        "spmv": "csr-csr"
-    },
-    "solver": {}
-}
+Running test case <filename>
 Matrix is of size (36, 36)
 	Running solver: cg
diff --git a/benchmark/test/reference/distributed_solver.matrix.stdout b/benchmark/test/reference/distributed_solver.matrix.stdout
index cd3c7b8bd43..ec1d258e2f4 100644
--- a/benchmark/test/reference/distributed_solver.matrix.stdout
+++ b/benchmark/test/reference/distributed_solver.matrix.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "filename": "",
@@ -55,4 +54,4 @@
         "rows": 36,
         "cols": 36
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/matrix_statistics.matrix.stderr b/benchmark/test/reference/matrix_statistics.matrix.stderr
index 7bb33842f25..0b31ef3a888 100644
--- a/benchmark/test/reference/matrix_statistics.matrix.stderr
+++ b/benchmark/test/reference/matrix_statistics.matrix.stderr
@@ -1,8 +1,4 @@
 This is Ginkgo 1.7.0 (develop)
     running with core module 1.7.0 (develop)
-Running test case
-{
-    "filename": "",
-    "problem": {}
-}
+Running test case <filename>
 Matrix is of size (36, 36), 208
diff --git a/benchmark/test/reference/matrix_statistics.matrix.stdout b/benchmark/test/reference/matrix_statistics.matrix.stdout
index ea73587fde4..a6297e89b66 100644
--- a/benchmark/test/reference/matrix_statistics.matrix.stdout
+++ b/benchmark/test/reference/matrix_statistics.matrix.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "filename": "",
@@ -37,4 +36,4 @@
         "cols": 36,
         "nonzeros": 208
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/preconditioner.matrix.stderr b/benchmark/test/reference/preconditioner.matrix.stderr
index 4088a20c925..7452ab91b3a 100644
--- a/benchmark/test/reference/preconditioner.matrix.stderr
+++ b/benchmark/test/reference/preconditioner.matrix.stderr
@@ -4,10 +4,6 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 Running with preconditioners: none
-Running test case
-{
-    "filename": "",
-    "preconditioner": {}
-}
+Running test case <filename>
 Matrix is of size (36, 36), 208
 	Running preconditioner: none
diff --git a/benchmark/test/reference/preconditioner.matrix.stdout b/benchmark/test/reference/preconditioner.matrix.stdout
index 0415a87ea8d..51adb7383c3 100644
--- a/benchmark/test/reference/preconditioner.matrix.stdout
+++ b/benchmark/test/reference/preconditioner.matrix.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "filename": "",
@@ -29,4 +28,4 @@
         "cols": 36,
         "nonzeros": 208
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/solver.matrix.stderr b/benchmark/test/reference/solver.matrix.stderr
index 8a1ea117314..cd2bb49261c 100644
--- a/benchmark/test/reference/solver.matrix.stderr
+++ b/benchmark/test/reference/solver.matrix.stderr
@@ -5,13 +5,6 @@ Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-Running test case
-{
-    "filename": "",
-    "optimal": {
-        "spmv": "csr"
-    },
-    "solver": {}
-}
+Running test case <filename>
 Matrix is of size (36, 36)
 	Running solver: cg
diff --git a/benchmark/test/reference/solver.matrix.stdout b/benchmark/test/reference/solver.matrix.stdout
index 56577288c2d..a87e78f7f66 100644
--- a/benchmark/test/reference/solver.matrix.stdout
+++ b/benchmark/test/reference/solver.matrix.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "filename": "",
@@ -53,4 +52,4 @@
         "rows": 36,
         "cols": 36
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/sparse_blas.matrix.stderr b/benchmark/test/reference/sparse_blas.matrix.stderr
index ff52b6a3269..483429fd71d 100644
--- a/benchmark/test/reference/sparse_blas.matrix.stderr
+++ b/benchmark/test/reference/sparse_blas.matrix.stderr
@@ -4,10 +4,6 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The operations are transpose
-Running test case
-{
-    "filename": "",
-    "sparse_blas": {}
-}
+Running test case <filename>
 Matrix is of size (36, 36), 208
 	Running sparse_blas: transpose
diff --git a/benchmark/test/reference/sparse_blas.matrix.stdout b/benchmark/test/reference/sparse_blas.matrix.stdout
index 4a64c8ea1ce..74fdbf98e7a 100644
--- a/benchmark/test/reference/sparse_blas.matrix.stdout
+++ b/benchmark/test/reference/sparse_blas.matrix.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "filename": "",
@@ -22,4 +21,4 @@
         "cols": 36,
         "nonzeros": 208
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/reference/spmv.matrix.stderr b/benchmark/test/reference/spmv.matrix.stderr
index a618da5b321..45beba6cafb 100644
--- a/benchmark/test/reference/spmv.matrix.stderr
+++ b/benchmark/test/reference/spmv.matrix.stderr
@@ -5,10 +5,6 @@ Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo
 The number of right hand sides is 1
-Running test case
-{
-    "filename": "",
-    "spmv": {}
-}
+Running test case <filename>
 Matrix is of size (36, 36), 208
 	Running spmv: coo
diff --git a/benchmark/test/reference/spmv.matrix.stdout b/benchmark/test/reference/spmv.matrix.stdout
index dc30ab6b284..4d03ce3cd07 100644
--- a/benchmark/test/reference/spmv.matrix.stdout
+++ b/benchmark/test/reference/spmv.matrix.stdout
@@ -1,4 +1,3 @@
-
 [
     {
         "filename": "",
@@ -18,4 +17,4 @@
             "spmv": "coo"
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index 014d3cb41a5..6e3092bde6c 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -153,6 +153,7 @@ def compare_output_impl(
     typename_patterns = [
         ("(apply|generate|check|copy|move)\([^())]*\)", "\\1(<typename>)"),
         ("what\\(\\): .*", "what(): <removed>"),
+        (re.escape(str(matrixpath)), "<filename>"),
     ]
     if generate:
         open(expected_stdout, "w").write(

From 8adf765865c03f261c155ed1e1db50550b7eef2c Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Sat, 19 Aug 2023 11:27:22 +0200
Subject: [PATCH 07/13] review updates

- rename 'determinize' -> 'sanitize'
- use empty struct for empty benchmark state
- use version tag instead of commit ID
- use std::endl where appropriate

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 .../matrix_statistics/matrix_statistics.cpp   |  5 +++-
 benchmark/test/test_framework.py.in           | 30 +++++++++++--------
 benchmark/utils/general.hpp                   |  2 +-
 third_party/nlohmann_json/CMakeLists.txt      |  2 +-
 4 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp
index 40c505c7627..4bb63032550 100644
--- a/benchmark/matrix_statistics/matrix_statistics.cpp
+++ b/benchmark/matrix_statistics/matrix_statistics.cpp
@@ -149,7 +149,10 @@ void extract_matrix_statistics(gko::matrix_data<etype, gko::int64>& data,
 using Generator = DefaultSystemGenerator<etype, gko::int64>;
 
 
-struct MatrixStatistics : Benchmark<int> {
+struct empty_state {};
+
+
+struct MatrixStatistics : Benchmark<empty_state> {
     std::string name;
     std::vector<std::string> empty;
 
diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index 6e3092bde6c..1a07818df1f 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -22,7 +22,8 @@ denumberify_paths = [
     "rhs_norm",
     "max_relative_norm2",
 ]
-detypenameify_key_starts = ["generate(", "apply(", "advanced_apply(", "copy(", "check("]
+detypenameify_key_starts = [
+    "generate(", "apply(", "advanced_apply(", "copy(", "check("]
 empty_string_paths = ["filename"]
 empty_array_paths = [
     "recurrent_residuals",
@@ -44,7 +45,7 @@ def sanitize_json_key(key: str):
     return key
 
 
-def sanitize_json_key_value(key: str, value, sanitize_all: bool):
+def sanitize_json_value(key: str, value, sanitize_all: bool):
     """Applies sanitation to a single key-value pair.
 
     Strings with a key in empty_string_paths will be emptied
@@ -72,7 +73,7 @@ def sanitize_json(parsed_input, sanitize_all: bool = False):
 
     if isinstance(parsed_input, dict):
         return {
-            sanitize_json_key(key): sanitize_json_key_value(key, value, sanitize_all)
+            sanitize_json_key(key): sanitize_json_value(key, value, sanitize_all)
             for key, value in parsed_input.items()
         }
     elif isinstance(parsed_input, list):
@@ -83,7 +84,7 @@ def sanitize_json(parsed_input, sanitize_all: bool = False):
         return parsed_input
 
 
-def determinize_json_text(input: str) -> List[str]:
+def sanitize_json_text(input: str) -> List[str]:
     """Sanitizes the given input JSON string.
 
     The JSON values will be parsed and sanitized through sanitize_json(...)
@@ -94,7 +95,7 @@ def determinize_json_text(input: str) -> List[str]:
     return result.splitlines()
 
 
-def determinize_text(
+def sanitize_text(
     input: str,
     ignore_patterns: List[str],
     replace_patterns: List[Tuple[str, str]],
@@ -157,11 +158,11 @@ def compare_output_impl(
     ]
     if generate:
         open(expected_stdout, "w").write(
-            "\n".join(determinize_json_text(result.stdout.decode()))
+            "\n".join(sanitize_json_text(result.stdout.decode()))
         )
         open(expected_stderr, "w").write(
             "\n".join(
-                determinize_text(
+                sanitize_text(
                     result.stderr.decode(),
                     ignore_patterns=ignore_patterns,
                     replace_patterns=typename_patterns,
@@ -170,14 +171,15 @@ def compare_output_impl(
         )
         print("GENERATED")
         return
-    result_stdout_processed = determinize_json_text(result.stdout.decode())
-    result_stderr_processed = determinize_text(
+    result_stdout_processed = sanitize_json_text(result.stdout.decode())
+    result_stderr_processed = sanitize_text(
         result.stderr.decode(),
         ignore_patterns=ignore_patterns,
         replace_patterns=typename_patterns,
     )
-    expected_stdout_processed = determinize_json_text(open(expected_stdout).read())
-    expected_stderr_processed = determinize_text(
+    expected_stdout_processed = sanitize_json_text(
+        open(expected_stdout).read())
+    expected_stderr_processed = sanitize_text(
         open(expected_stderr).read(),
         ignore_patterns=ignore_patterns,
         replace_patterns=typename_patterns,
@@ -187,7 +189,8 @@ def compare_output_impl(
         print("FAIL: stdout differs")
         print(
             "\n".join(
-                difflib.unified_diff(expected_stdout_processed, result_stdout_processed)
+                difflib.unified_diff(
+                    expected_stdout_processed, result_stdout_processed)
             )
         )
         failed = True
@@ -195,7 +198,8 @@ def compare_output_impl(
         print("FAIL: stderr differs")
         print(
             "\n".join(
-                difflib.unified_diff(expected_stderr_processed, result_stderr_processed)
+                difflib.unified_diff(
+                    expected_stderr_processed, result_stderr_processed)
             )
         )
         failed = True
diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 1c48680f883..550f6fe2720 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -216,7 +216,7 @@ void print_general_information(const std::string& extra)
     }
     std::clog << "The random seed for right hand sides is " << FLAGS_seed
               << '\n'
-              << extra << '\n';
+              << extra << std::endl;
 }
 
 
diff --git a/third_party/nlohmann_json/CMakeLists.txt b/third_party/nlohmann_json/CMakeLists.txt
index 77064c66c40..b95cfa5606a 100644
--- a/third_party/nlohmann_json/CMakeLists.txt
+++ b/third_party/nlohmann_json/CMakeLists.txt
@@ -3,7 +3,7 @@ include(FetchContent)
 FetchContent_Declare(
     nlohmann_json
     GIT_REPOSITORY https://github.com/nlohmann/json.git
-    GIT_TAG        bc889afb4c5bf1c0d8ee29ef35eaaf4c8bef8a5d
+    GIT_TAG        v3.9.1
 )
 set(JSON_BuildTests OFF CACHE INTERNAL "")
 FetchContent_MakeAvailable(nlohmann_json)

From 8d52ec8a29c4c03af4d4cdbdf7d9dae06ccd64d7 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Sat, 19 Aug 2023 12:19:00 +0200
Subject: [PATCH 08/13] annotate repetitions

---
 benchmark/blas/blas_common.hpp                | 17 +++++++-----
 benchmark/conversion/conversion.cpp           | 26 +++++++++++++------
 .../matrix_statistics/matrix_statistics.cpp   |  9 ++++---
 benchmark/preconditioner/preconditioner.cpp   | 13 +++++++---
 benchmark/solver/solver_common.hpp            | 21 +++++++++------
 benchmark/sparse_blas/sparse_blas.cpp         | 17 +++++++-----
 benchmark/spmv/spmv_common.hpp                | 16 +++++++-----
 benchmark/utils/general.hpp                   | 26 +++++++++++++++++++
 benchmark/utils/runner.hpp                    | 16 ++++--------
 9 files changed, 109 insertions(+), 52 deletions(-)

diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp
index 88819a043b0..1267dc57c15 100644
--- a/benchmark/blas/blas_common.hpp
+++ b/benchmark/blas/blas_common.hpp
@@ -489,7 +489,8 @@ struct BlasBenchmark : Benchmark<dimensions> {
 
 
     void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
-             dimensions& dims, const std::string& operation_name,
+             annotate_functor annotate, dimensions& dims,
+             const std::string& operation_name,
              json& operation_case) const override
     {
         auto op = operation_map.at(operation_name)(exec, dims);
@@ -497,16 +498,20 @@ struct BlasBenchmark : Benchmark<dimensions> {
         IterationControl ic(timer);
 
         // warm run
-        for (auto _ : ic.warmup_run()) {
-            op->prepare();
-            exec->synchronize();
-            op->run();
-            exec->synchronize();
+        {
+            auto range = annotate("warmup", FLAGS_warmup > 0);
+            for (auto _ : ic.warmup_run()) {
+                op->prepare();
+                exec->synchronize();
+                op->run();
+                exec->synchronize();
+            }
         }
 
         // timed run
         op->prepare();
         for (auto _ : ic.run()) {
+            auto range = annotate("repetition");
             op->run();
         }
         const auto runtime = ic.compute_time(FLAGS_timer_method);
diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp
index 5f03cb2b933..c777db1a35a 100644
--- a/benchmark/conversion/conversion.cpp
+++ b/benchmark/conversion/conversion.cpp
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "benchmark/utils/formats.hpp"
+#include "benchmark/utils/general.hpp"
 #include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
 #include "benchmark/utils/iteration_control.hpp"
@@ -128,6 +129,7 @@ struct ConversionBenchmark : Benchmark<gko::device_matrix_data<etype, itype>> {
 
 
     void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             annotate_functor annotate,
              gko::device_matrix_data<etype, itype>& data,
              const std::string& operation_name,
              json& operation_case) const override
@@ -142,13 +144,17 @@ struct ConversionBenchmark : Benchmark<gko::device_matrix_data<etype, itype>> {
         IterationControl ic{timer};
         if (to_name == "read") {
             // warm run
-            for (auto _ : ic.warmup_run()) {
-                exec->synchronize();
-                readable->read(data);
-                exec->synchronize();
+            {
+                auto range = annotate("warmup", FLAGS_warmup > 0);
+                for (auto _ : ic.warmup_run()) {
+                    exec->synchronize();
+                    readable->read(data);
+                    exec->synchronize();
+                }
             }
             // timed run
             for (auto _ : ic.run()) {
+                auto range = annotate("repetition");
                 readable->read(data);
             }
         } else {
@@ -156,13 +162,17 @@ struct ConversionBenchmark : Benchmark<gko::device_matrix_data<etype, itype>> {
             auto mtx_to = formats::matrix_type_factory.at(to_name)(exec);
 
             // warm run
-            for (auto _ : ic.warmup_run()) {
-                exec->synchronize();
-                mtx_to->copy_from(mtx_from);
-                exec->synchronize();
+            {
+                auto range = annotate("warmup", FLAGS_warmup > 0);
+                for (auto _ : ic.warmup_run()) {
+                    exec->synchronize();
+                    mtx_to->copy_from(mtx_from);
+                    exec->synchronize();
+                }
             }
             // timed run
             for (auto _ : ic.run()) {
+                auto range = annotate("repetition");
                 mtx_to->copy_from(mtx_from);
             }
         }
diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp
index 4bb63032550..20feecf5ccf 100644
--- a/benchmark/matrix_statistics/matrix_statistics.cpp
+++ b/benchmark/matrix_statistics/matrix_statistics.cpp
@@ -182,8 +182,8 @@ struct MatrixStatistics : Benchmark<empty_state> {
         return Generator::describe_config(test_case);
     }
 
-    int setup(std::shared_ptr<gko::Executor> exec,
-              json& test_case) const override
+    empty_state setup(std::shared_ptr<gko::Executor> exec,
+                      json& test_case) const override
     {
         auto data = Generator::generate_matrix_data(test_case);
         std::clog << "Matrix is of size (" << data.size[0] << ", "
@@ -193,12 +193,13 @@ struct MatrixStatistics : Benchmark<empty_state> {
         test_case["nonzeros"] = data.nonzeros.size();
 
         extract_matrix_statistics(data, test_case["problem"]);
-        return 0;
+        return {};
     }
 
 
     void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
-             int& data, const std::string& operation_name,
+             annotate_functor annotate, empty_state& data,
+             const std::string& operation_name,
              json& operation_case) const override
     {}
 };
diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp
index 7c130328d34..98f116f9b12 100644
--- a/benchmark/preconditioner/preconditioner.cpp
+++ b/benchmark/preconditioner/preconditioner.cpp
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "benchmark/utils/formats.hpp"
+#include "benchmark/utils/general.hpp"
 #include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
 #include "benchmark/utils/iteration_control.hpp"
@@ -199,7 +200,7 @@ struct PreconditionerBenchmark : Benchmark<preconditioner_benchmark_state> {
 
 
     void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
-             preconditioner_benchmark_state& state,
+             annotate_functor annotate, preconditioner_benchmark_state& state,
              const std::string& encoded_precond_name,
              json& precond_case) const override
     {
@@ -219,12 +220,17 @@ struct PreconditionerBenchmark : Benchmark<preconditioner_benchmark_state> {
 
             auto precond = precond_factory.at(decoded_precond_name)(exec);
 
-            for (auto _ : ic_apply.warmup_run()) {
-                precond->generate(state.system_matrix)->apply(state.b, x_clone);
+            {
+                auto range = annotate("warmup", FLAGS_warmup > 0);
+                for (auto _ : ic_apply.warmup_run()) {
+                    precond->generate(state.system_matrix)
+                        ->apply(state.b, x_clone);
+                }
             }
 
             std::unique_ptr<gko::LinOp> precond_op;
             for (auto _ : ic_gen.run()) {
+                auto range = annotate("repetition generate");
                 precond_op = precond->generate(state.system_matrix);
             }
 
@@ -234,6 +240,7 @@ struct PreconditionerBenchmark : Benchmark<preconditioner_benchmark_state> {
                 ic_gen.get_num_repetitions();
 
             for (auto _ : ic_apply.run()) {
+                auto range = annotate("repetition apply");
                 precond_op->apply(state.b, x_clone);
             }
 
diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp
index 4976e5759d4..597ab76729a 100644
--- a/benchmark/solver/solver_common.hpp
+++ b/benchmark/solver/solver_common.hpp
@@ -458,6 +458,7 @@ struct SolverBenchmark : Benchmark<solver_benchmark_state<Generator>> {
 
 
     void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             annotate_functor annotate,
              solver_benchmark_state<Generator>& state,
              const std::string& encoded_solver_name,
              json& solver_case) const override
@@ -482,14 +483,17 @@ struct SolverBenchmark : Benchmark<solver_benchmark_state<Generator>> {
 
         // warm run
         std::shared_ptr<gko::LinOp> solver;
-        for (auto _ : ic.warmup_run()) {
-            auto x_clone = clone(state.x);
-            auto precond = precond_factory.at(precond_name)(exec);
-            solver = generate_solver(exec, give(precond), solver_name,
-                                     FLAGS_warmup_max_iters)
-                         ->generate(state.system_matrix);
-            solver->apply(state.b, x_clone);
-            exec->synchronize();
+        {
+            auto range = annotate("warmup", FLAGS_warmup > 0);
+            for (auto _ : ic.warmup_run()) {
+                auto x_clone = clone(state.x);
+                auto precond = precond_factory.at(precond_name)(exec);
+                solver = generate_solver(exec, give(precond), solver_name,
+                                         FLAGS_warmup_max_iters)
+                             ->generate(state.system_matrix);
+                solver->apply(state.b, x_clone);
+                exec->synchronize();
+            }
         }
 
         // detail run
@@ -566,6 +570,7 @@ struct SolverBenchmark : Benchmark<solver_benchmark_state<Generator>> {
         auto apply_timer = ic.get_timer();
         auto x_clone = clone(state.x);
         for (auto status : ic.run(false)) {
+            auto range = annotate("repetition");
             x_clone = clone(state.x);
 
             exec->synchronize();
diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp
index 21df4d9c448..5d479eb7fc0 100644
--- a/benchmark/sparse_blas/sparse_blas.cpp
+++ b/benchmark/sparse_blas/sparse_blas.cpp
@@ -128,7 +128,8 @@ struct SparseBlasBenchmark : Benchmark<std::unique_ptr<Mtx>> {
 
 
     void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
-             std::unique_ptr<Mtx>& mtx, const std::string& operation_name,
+             annotate_functor annotate, std::unique_ptr<Mtx>& mtx,
+             const std::string& operation_name,
              json& operation_case) const override
     {
         auto op = get_operation(operation_name, mtx.get());
@@ -136,16 +137,20 @@ struct SparseBlasBenchmark : Benchmark<std::unique_ptr<Mtx>> {
         IterationControl ic(timer);
 
         // warm run
-        for (auto _ : ic.warmup_run()) {
-            op->prepare();
-            exec->synchronize();
-            op->run();
-            exec->synchronize();
+        {
+            auto range = annotate("warmup", FLAGS_warmup > 0);
+            for (auto _ : ic.warmup_run()) {
+                op->prepare();
+                exec->synchronize();
+                op->run();
+                exec->synchronize();
+            }
         }
 
         // timed run
         op->prepare();
         for (auto _ : ic.run()) {
+            auto range = annotate("repetition");
             op->run();
         }
         const auto runtime = ic.compute_time(FLAGS_timer_method);
diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp
index 4a7d014de8b..f589077834e 100644
--- a/benchmark/spmv/spmv_common.hpp
+++ b/benchmark/spmv/spmv_common.hpp
@@ -130,7 +130,7 @@ struct SpmvBenchmark : Benchmark<spmv_benchmark_state<Generator>> {
     }
 
     void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
-             spmv_benchmark_state<Generator>& state,
+             annotate_functor annotate, spmv_benchmark_state<Generator>& state,
              const std::string& format_name, json& format_case) const override
     {
         auto system_matrix = generator.generate_matrix_with_format(
@@ -149,11 +149,14 @@ struct SpmvBenchmark : Benchmark<spmv_benchmark_state<Generator>> {
 
         IterationControl ic{timer};
         // warm run
-        for (auto _ : ic.warmup_run()) {
-            auto x_clone = clone(state.x);
-            exec->synchronize();
-            system_matrix->apply(state.b, x_clone);
-            exec->synchronize();
+        {
+            auto range = annotate("warmup", FLAGS_warmup > 0);
+            for (auto _ : ic.warmup_run()) {
+                auto x_clone = clone(state.x);
+                exec->synchronize();
+                system_matrix->apply(state.b, x_clone);
+                exec->synchronize();
+            }
         }
 
         // tuning run
@@ -192,6 +195,7 @@ struct SpmvBenchmark : Benchmark<spmv_benchmark_state<Generator>> {
         // timed run
         auto x_clone = clone(state.x);
         for (auto _ : ic.run()) {
+            auto range = annotate("repetition");
             system_matrix->apply(state.b, x_clone);
         }
         format_case["time"] = ic.compute_time(FLAGS_timer_method);
diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 550f6fe2720..6012cb6c77b 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -245,6 +245,32 @@ std::shared_ptr<gko::log::ProfilerHook> create_profiler_hook(
 }
 
 
+struct annotate_functor {
+    gko::log::profiling_scope_guard operator()(const char* name) const
+    {
+        if (profiler_hook) {
+            return profiler_hook->user_range(name);
+        }
+        return {};
+    }
+
+    gko::log::profiling_scope_guard operator()(const char* name,
+                                               bool should_annotate) const
+    {
+        if (profiler_hook && should_annotate) {
+            return profiler_hook->user_range(name);
+        }
+        return {};
+    }
+
+    annotate_functor(std::shared_ptr<gko::log::ProfilerHook> profiler_hook)
+        : profiler_hook{std::move(profiler_hook)}
+    {}
+
+    std::shared_ptr<gko::log::ProfilerHook> profiler_hook;
+};
+
+
 // Returns a random number engine
 std::default_random_engine& get_engine()
 {
diff --git a/benchmark/utils/runner.hpp b/benchmark/utils/runner.hpp
index 661c403706f..264dc3965db 100644
--- a/benchmark/utils/runner.hpp
+++ b/benchmark/utils/runner.hpp
@@ -102,8 +102,8 @@ struct Benchmark {
 
     /** Runs a single operation of the benchmark */
     virtual void run(std::shared_ptr<gko::Executor> exec,
-                     std::shared_ptr<Timer> timer, State& state,
-                     const std::string& operation,
+                     std::shared_ptr<Timer> timer, annotate_functor annotate,
+                     State& state, const std::string& operation,
                      json& operation_case) const = 0;
 
     /** Post-process test case info. */
@@ -139,13 +139,7 @@ void run_test_cases(const Benchmark<State>& benchmark,
     if (profiler_hook) {
         exec->add_logger(profiler_hook);
     }
-    auto annotate =
-        [profiler_hook](const char* name) -> gko::log::profiling_scope_guard {
-        if (profiler_hook) {
-            return profiler_hook->user_range(name);
-        }
-        return {};
-    };
+    auto annotate = annotate_functor(profiler_hook);
 
     for (auto& test_case : test_cases) {
         try {
@@ -174,8 +168,8 @@ void run_test_cases(const Benchmark<State>& benchmark,
                 auto& operation_case = benchmark_case[operation_name];
                 try {
                     auto operation_range = annotate(operation_name.c_str());
-                    benchmark.run(exec, timer, test_case_state, operation_name,
-                                  operation_case);
+                    benchmark.run(exec, timer, annotate, test_case_state,
+                                  operation_name, operation_case);
                     operation_case["completed"] = true;
                 } catch (const std::exception& e) {
                     operation_case["completed"] = false;

From e2f29961e909163af01ba63e5b5cf1c41e64cc5c Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Sat, 19 Aug 2023 12:19:10 +0200
Subject: [PATCH 09/13] update test output

---
 benchmark/test/reference/blas.profile.stderr              | 6 ++++++
 benchmark/test/reference/conversion.profile.stderr        | 8 ++++++++
 .../test/reference/distributed_solver.profile.stderr      | 2 ++
 .../reference/multi_vector_distributed.profile.stderr     | 6 ++++++
 benchmark/test/reference/preconditioner.profile.stderr    | 4 ++++
 benchmark/test/reference/solver.profile.stderr            | 2 ++
 benchmark/test/reference/sparse_blas.profile.stderr       | 2 ++
 benchmark/test/reference/spmv.profile.stderr              | 2 ++
 benchmark/test/reference/spmv_distributed.profile.stderr  | 2 ++
 9 files changed, 34 insertions(+)

diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr
index 529fc16009c..7307fb0ad7e 100644
--- a/benchmark/test/reference/blas.profile.stderr
+++ b/benchmark/test/reference/blas.profile.stderr
@@ -10,8 +10,10 @@ DEBUG: begin n = 100
 DEBUG: begin copy
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
+DEBUG: begin repetition
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
+DEBUG: end   repetition
 DEBUG: end   copy
 	Running blas: axpy
 DEBUG: begin axpy
@@ -21,8 +23,10 @@ DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
+DEBUG: begin repetition
 DEBUG: begin dense::add_scaled
 DEBUG: end   dense::add_scaled
+DEBUG: end   repetition
 DEBUG: end   axpy
 	Running blas: scal
 DEBUG: begin scal
@@ -30,7 +34,9 @@ DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
+DEBUG: begin repetition
 DEBUG: begin dense::scale
 DEBUG: end   dense::scale
+DEBUG: end   repetition
 DEBUG: end   scal
 DEBUG: end   n = 100 
diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr
index a233579c721..3a4301b13eb 100644
--- a/benchmark/test/reference/conversion.profile.stderr
+++ b/benchmark/test/reference/conversion.profile.stderr
@@ -11,12 +11,14 @@ DEBUG: end   components::aos_to_soa
 DEBUG: begin stencil(100,7pt)
 	Running conversion: coo-read
 DEBUG: begin coo-read
+DEBUG: begin repetition
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin copy
 DEBUG: end   copy
+DEBUG: end   repetition
 DEBUG: end   coo-read
 	Running conversion: coo-csr
 DEBUG: begin coo-csr
@@ -28,6 +30,7 @@ DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
+DEBUG: begin repetition
 DEBUG: begin copy(<typename>)
 DEBUG: begin copy
 DEBUG: end   copy
@@ -36,11 +39,13 @@ DEBUG: end   copy
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
 DEBUG: end   copy(<typename>)
+DEBUG: end   repetition
 DEBUG: end   coo-csr
 	Running conversion: csr-read
 DEBUG: begin csr-read
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
+DEBUG: begin repetition
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin copy
@@ -49,6 +54,7 @@ DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: end   repetition
 DEBUG: end   csr-read
 	Running conversion: csr-coo
 DEBUG: begin csr-coo
@@ -62,6 +68,7 @@ DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin repetition
 DEBUG: begin copy(<typename>)
 DEBUG: begin copy
 DEBUG: end   copy
@@ -70,5 +77,6 @@ DEBUG: end   copy
 DEBUG: begin components::convert_ptrs_to_idxs
 DEBUG: end   components::convert_ptrs_to_idxs
 DEBUG: end   copy(<typename>)
+DEBUG: end   repetition
 DEBUG: end   csr-coo
 DEBUG: end   stencil(100,7pt)
diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr
index 4ea20730117..227737e56b3 100644
--- a/benchmark/test/reference/distributed_solver.profile.stderr
+++ b/benchmark/test/reference/distributed_solver.profile.stderr
@@ -90,6 +90,7 @@ DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
+DEBUG: begin repetition
 DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
@@ -420,6 +421,7 @@ DEBUG: end   check(<typename>)
 DEBUG: end   check(<typename>)
 DEBUG: end   iteration
 DEBUG: end   apply(<typename>)
+DEBUG: end   repetition
 DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr
index 102330e38f4..85bd138514b 100644
--- a/benchmark/test/reference/multi_vector_distributed.profile.stderr
+++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr
@@ -46,8 +46,10 @@ DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
+DEBUG: begin repetition
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
+DEBUG: end   repetition
 DEBUG: end   copy
 	Running blas: axpy
 DEBUG: begin axpy
@@ -93,8 +95,10 @@ DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
+DEBUG: begin repetition
 DEBUG: begin dense::add_scaled
 DEBUG: end   dense::add_scaled
+DEBUG: end   repetition
 DEBUG: end   axpy
 	Running blas: scal
 DEBUG: begin scal
@@ -120,7 +124,9 @@ DEBUG: begin dense::fill
 DEBUG: end   dense::fill
 DEBUG: begin dense::fill
 DEBUG: end   dense::fill
+DEBUG: begin repetition
 DEBUG: begin dense::scale
 DEBUG: end   dense::scale
+DEBUG: end   repetition
 DEBUG: end   scal
 DEBUG: end   n = 100 
diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr
index 610dfe464ec..e2069c318d2 100644
--- a/benchmark/test/reference/preconditioner.profile.stderr
+++ b/benchmark/test/reference/preconditioner.profile.stderr
@@ -31,13 +31,17 @@ DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
+DEBUG: begin repetition generate
 DEBUG: begin generate(<typename>)
 DEBUG: end   generate(<typename>)
+DEBUG: end   repetition generate
+DEBUG: begin repetition apply
 DEBUG: begin apply(<typename>)
 DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
 DEBUG: end   apply(<typename>)
+DEBUG: end   repetition apply
 DEBUG: end   none
 DEBUG: end   stencil(100,7pt)
diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr
index 238591eb0c9..5e1e2cdb312 100644
--- a/benchmark/test/reference/solver.profile.stderr
+++ b/benchmark/test/reference/solver.profile.stderr
@@ -34,6 +34,7 @@ DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
+DEBUG: begin repetition
 DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
@@ -282,6 +283,7 @@ DEBUG: end   check(<typename>)
 DEBUG: end   check(<typename>)
 DEBUG: end   iteration
 DEBUG: end   apply(<typename>)
+DEBUG: end   repetition
 DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr
index 60cf41ccbae..fd991de7063 100644
--- a/benchmark/test/reference/sparse_blas.profile.stderr
+++ b/benchmark/test/reference/sparse_blas.profile.stderr
@@ -15,9 +15,11 @@ DEBUG: end   components::convert_idxs_to_ptrs
 DEBUG: begin stencil(100,7pt)
 	Running sparse_blas: transpose
 DEBUG: begin transpose
+DEBUG: begin repetition
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin csr::transpose
 DEBUG: end   csr::transpose
+DEBUG: end   repetition
 DEBUG: end   transpose
 DEBUG: end   stencil(100,7pt)
diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr
index 2299614c6c4..1cc24a5f186 100644
--- a/benchmark/test/reference/spmv.profile.stderr
+++ b/benchmark/test/reference/spmv.profile.stderr
@@ -28,9 +28,11 @@ DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
+DEBUG: begin repetition
 DEBUG: begin apply(<typename>)
 DEBUG: begin coo::spmv
 DEBUG: end   coo::spmv
 DEBUG: end   apply(<typename>)
+DEBUG: end   repetition
 DEBUG: end   coo
 DEBUG: end   stencil(100,7pt)
diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr
index b44cef7f3f6..f0d28332ef0 100644
--- a/benchmark/test/reference/spmv_distributed.profile.stderr
+++ b/benchmark/test/reference/spmv_distributed.profile.stderr
@@ -122,6 +122,7 @@ DEBUG: begin copy(<typename>)
 DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
+DEBUG: begin repetition
 DEBUG: begin apply(<typename>)
 DEBUG: begin dense::row_gather
 DEBUG: end   dense::row_gather
@@ -134,5 +135,6 @@ DEBUG: begin csr::advanced_spmv
 DEBUG: end   csr::advanced_spmv
 DEBUG: end   advanced_apply(<typename>)
 DEBUG: end   apply(<typename>)
+DEBUG: end   repetition
 DEBUG: end   csr-csr
 DEBUG: end   stencil(100,7pt,stencil)

From 49ffd96d68d39c4a80e97f72ff9c43923b856a3c Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 22 Aug 2023 17:52:30 +0200
Subject: [PATCH 10/13] update documentation

---
 ABOUT-LICENSING.md        | 105 +++-----------------------------------
 INSTALL.md                |   6 +--
 benchmark/CMakeLists.txt  |   4 +-
 dev_tools/scripts/regroup |   2 +-
 4 files changed, 14 insertions(+), 103 deletions(-)

diff --git a/ABOUT-LICENSING.md b/ABOUT-LICENSING.md
index df081e2211b..d6e68911d1a 100644
--- a/ABOUT-LICENSING.md
+++ b/ABOUT-LICENSING.md
@@ -76,7 +76,7 @@ the following license:
 
 When compiling Ginkgo with `-DGINKGO_BUILD_BENCHMARKS=ON` the build system will
 download, build, and link [gflags](https://github.com/gflags/gflags) and
-[RapidJSON](https://github.com/Tencent/rapidjson) with the
+[nlohmann-json](https://github.com/nlohmann/json) with the
 benchmark suites. gtest is available under the following license:
 
 > Copyright (c) 2006, Google Inc.
@@ -108,110 +108,22 @@ benchmark suites. gtest is available under the following license:
 > (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 > OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-RapidJSON is available under the following license (note that Ginkgo's build
-system automatically removes the `bin/jsonchecker/` directory which is licensed
-under the problematic JSON license):
+nlohmann-json is available under the following license:
 
-> Tencent is pleased to support the open source community by making RapidJSON
-> available.
->
-> Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.  All
-> rights reserved.
->
-> If you have downloaded a copy of the RapidJSON binary from Tencent, please
-> note that the RapidJSON binary is licensed under the MIT License.  If you have
-> downloaded a copy of the RapidJSON source code from Tencent, please note that
-> RapidJSON source code is licensed under the MIT License, except for the
-> third-party components listed below which are subject to different license
-> terms.  Your integration of RapidJSON into your own projects may require
-> compliance with the MIT License, as well as the other licenses applicable to
-> the third-party components included within RapidJSON. To avoid the problematic
-> JSON license in your own projects, it's sufficient to exclude the
-> bin/jsonchecker/ directory, as it's the only code under the JSON license.  A
-> copy of the MIT License is included in this file.
->
-> Other dependencies and licenses:
->
-> Open Source Software Licensed Under the BSD License:
-> --------------------------------------------------------------------
->
-> The msinttypes r29
->
-> Copyright (c) 2006-2013 Alexander Chemeris
-> All rights reserved.
->
-> Redistribution and use in source and binary forms, with or without
-> modification, are permitted provided that the following conditions are met:
->
-> * Redistributions of source code must retain the above copyright notice, this
->   list of conditions and the following disclaimer.
-> * Redistributions in binary form must reproduce the above copyright notice,
->   this list of conditions and the following disclaimer in the documentation
->   and/or other materials provided with the distribution.
-> * Neither the name of  copyright holder nor the names of its contributors may
->   be used to endorse or promote products derived from this software without
->   specific prior written permission.
->
-> THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
-> EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-> WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-> DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
-> DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-> (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-> LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-> ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-> (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-> SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
->
-> Open Source Software Licensed Under the JSON License:
-> --------------------------------------------------------------------
->
-> json.org
-> Copyright (c) 2002
-> JSON.org All Rights Reserved.
->
-> JSON_checker
-> Copyright (c) 2002 JSON.org
-> All Rights Reserved.
->
->
-> Terms of the JSON License:
-> ---------------------------------------------------
->
-> Permission is hereby granted, free of charge, to any person obtaining a copy
-> of this software and associated documentation files (the "Software"), to deal
-> in the Software without restriction, including without limitation the rights
-> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-> copies of the Software, and to permit persons to whom the Software is
-> furnished to do so, subject to the following conditions:
->
-> The above copyright notice and this permission notice shall be included in all
-> copies or substantial portions of the Software.
->
-> The Software shall be used for Good, not Evil.
->
-> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-> SOFTWARE.
->
->
-> Terms of the MIT License:
-> --------------------------------------------------------------------
->
+> MIT License
+>
+> Copyright (c) 2013-2022 Niels Lohmann
+>
 > Permission is hereby granted, free of charge, to any person obtaining a copy
 > of this software and associated documentation files (the "Software"), to deal
 > in the Software without restriction, including without limitation the rights
 > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 > copies of the Software, and to permit persons to whom the Software is
 > furnished to do so, subject to the following conditions:
->
+>
 > The above copyright notice and this permission notice shall be included in all
 > copies or substantial portions of the Software.
->
+>
 > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -220,7 +132,6 @@ under the problematic JSON license):
 > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 > SOFTWARE.
 
-
 For generating the documentation of Ginkgo, some scripts from the deal.II
 library are used. You can refer to the `doc/` folder to see which files are a
 modified version of deal.II's documentation generation scripts. Additionally,
diff --git a/INSTALL.md b/INSTALL.md
index 5f788ed0e28..b29358d4eb6 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -31,7 +31,7 @@ Ginkgo adds the following additional switches to control what is being built:
 *   `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests
     to speed them up, default is `OFF`.
 *   `-DGINKGO_BUILD_BENCHMARKS={ON, OFF}` builds Ginkgo's benchmarks
-    (will download gflags and rapidjson), default is `ON`.
+    (will download gflags and nlohmann-json), default is `ON`.
 *   `-DGINKGO_BUILD_EXAMPLES={ON, OFF}` builds Ginkgo's examples, default is `ON`
 *   `-DGINKGO_BUILD_EXTLIB_EXAMPLE={ON, OFF}` builds the interfacing example
     with deal.II, default is `OFF`.
@@ -205,7 +205,7 @@ packages can be turned off by disabling the relevant options.
   Test](https://github.com/google/googletest);
 + GINKGO_BUILD_BENCHMARKS=ON: For argument management we use
   [gflags](https://github.com/gflags/gflags) and for JSON parsing we use
-  [RapidJSON](https://github.com/Tencent/rapidjson);
+  [nlohmann-json](https://github.com/nlohmann/json);
 + GINKGO_DEVEL_TOOLS=ON:
   [git-cmake-format](https://github.com/gflegar/git-cmake-format) is our CMake
   helper for code formatting.
@@ -224,7 +224,7 @@ packages can be turned off by disabling the relevant options.
 Ginkgo attempts to use pre-installed versions of these package if they match
 version requirements using `find_package`. Otherwise, the configuration step
 will download the files for each of the packages `GTest`, `gflags`,
-`RapidJSON` and `hwloc` and build them internally.
+`nlohmann-json` and `hwloc` and build them internally.
 
 Note that, if the external packages were not installed to the default location,
 the CMake option `-DCMAKE_PREFIX_PATH=<path-list>` needs to be set to the
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index e993ee6cf0c..fd04620f595 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -46,7 +46,7 @@ endfunction()
 
 
 # Generates an executable for one precision. Each executable will be linked to
-# `ginkgo`, `gflags` and `rapidjson`.
+# `ginkgo`, `gflags` and `nlohmann-json`.
 # Note: This should only be used by `ginkgo_add_typed_benchmark_executables`
 #
 # \param name            name for the executable to create (including type suffix)
@@ -96,7 +96,7 @@ endfunction(ginkgo_add_single_benchmark_executable)
 
 
 # Generates an executable for each supported precision. Each executable will be
-# linked to `ginkgo`, `gflags` and `rapidjson`.
+# linked to `ginkgo`, `gflags` and `nlohmann-json`.
 #
 # \param name            base-name for the executable to create
 # \param use_lib_linops  Boolean indicating if linking against hipsparse/cusparse
diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup
index 85eade99289..e35bd37efee 100644
--- a/dev_tools/scripts/regroup
+++ b/dev_tools/scripts/regroup
@@ -1,6 +1,6 @@
 IncludeBlocks: Regroup
 IncludeCategories:
-  - Regex: '^<(rapidjson|gflags|gtest|papi).*'
+  - Regex: '^<(nlohmann|gflags|gtest|papi).*'
     Priority: 3
   - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi|mpi|nvToolsExt).*'
     Priority: 2

From a725d3cd93a165037180b5696ae381b1bfa3229d Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 23 Aug 2023 13:18:37 +0200
Subject: [PATCH 11/13] review updates

- remove unnecessary stdin in tests
- simplify validate_config
- consistently use pointer members instead of reference members

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 benchmark/solver/solver_common.hpp         |  6 +---
 benchmark/test/blas.py                     |  2 --
 benchmark/test/multi_vector_distributed.py |  2 --
 benchmark/utils/loggers.hpp                | 34 +++++++++++-----------
 4 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp
index 597ab76729a..0248ab8e757 100644
--- a/benchmark/solver/solver_common.hpp
+++ b/benchmark/solver/solver_common.hpp
@@ -414,11 +414,7 @@ struct SolverBenchmark : Benchmark<solver_benchmark_state<Generator>> {
 
     bool validate_config(const json& value) const override
     {
-        return ((value.contains("size") && value.contains("stencil") &&
-                 value["size"].is_number_integer() &&
-                 value["stencil"].is_string()) ||
-                (value.contains("filename") &&
-                 value["filename"].is_string())) &&
+        return generator.validate_config(value) &&
                (value.contains("optimal") &&
                 value["optimal"].contains("spmv") &&
                 value["optimal"]["spmv"].is_string());
diff --git a/benchmark/test/blas.py b/benchmark/test/blas.py
index 160d5364e20..ff5bddc5d08 100755
--- a/benchmark/test/blas.py
+++ b/benchmark/test/blas.py
@@ -22,7 +22,6 @@
     ["-input", str(test_framework.sourcepath / "input.blas.json")],
     expected_stdout="blas.simple.stdout",
     expected_stderr="blas.simple.stderr",
-    stdin='[{"n": 100}]',
 )
 
 # profiler annotations
@@ -30,5 +29,4 @@
     ["-input", '[{"n": 100}]', "-profile", "-profiler_hook", "debug"],
     expected_stdout="blas.profile.stdout",
     expected_stderr="blas.profile.stderr",
-    stdin='[{"n": 100}]',
 )
diff --git a/benchmark/test/multi_vector_distributed.py b/benchmark/test/multi_vector_distributed.py
index 1e0c4c8adf5..c62cb8ebd17 100644
--- a/benchmark/test/multi_vector_distributed.py
+++ b/benchmark/test/multi_vector_distributed.py
@@ -24,7 +24,6 @@
     ["-input", str(test_framework.sourcepath / "input.blas.json")],
     expected_stdout="multi_vector_distributed.simple.stdout",
     expected_stderr="multi_vector_distributed.simple.stderr",
-    stdin='[{"n": 100}]',
     num_procs=3,
 )
 
@@ -33,6 +32,5 @@
     ["-input", '[{"n": 100}]', "-profile", "-profiler_hook", "debug"],
     expected_stdout="multi_vector_distributed.profile.stdout",
     expected_stderr="multi_vector_distributed.profile.stderr",
-    stdin='[{"n": 100}]',
     num_procs=3,
 )
diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp
index 1e651811f0f..89ea6108eda 100644
--- a/benchmark/utils/loggers.hpp
+++ b/benchmark/utils/loggers.hpp
@@ -179,16 +179,16 @@ struct ResidualLogger : gko::log::Logger {
                                const gko::array<gko::stopping_status>* status,
                                bool all_stopped) const override
     {
-        timestamps.push_back(std::chrono::duration<double>(
-                                 std::chrono::steady_clock::now() - start)
-                                 .count());
+        timestamps->push_back(std::chrono::duration<double>(
+                                  std::chrono::steady_clock::now() - start)
+                                  .count());
         if (residual_norm) {
-            rec_res_norms.push_back(
+            rec_res_norms->push_back(
                 get_norm(gko::as<vec<rc_vtype>>(residual_norm)));
         } else {
             gko::detail::vector_dispatch<rc_vtype>(
                 residual, [&](const auto v_residual) {
-                    rec_res_norms.push_back(compute_norm2(v_residual));
+                    rec_res_norms->push_back(compute_norm2(v_residual));
                 });
         }
         if (solution) {
@@ -196,18 +196,18 @@ struct ResidualLogger : gko::log::Logger {
                 rc_vtype>(solution, [&](auto v_solution) {
                 using concrete_type =
                     std::remove_pointer_t<std::decay_t<decltype(v_solution)>>;
-                true_res_norms.push_back(compute_residual_norm(
+                true_res_norms->push_back(compute_residual_norm(
                     matrix, gko::as<concrete_type>(b), v_solution));
             });
         } else {
-            true_res_norms.push_back(-1.0);
+            true_res_norms->push_back(-1.0);
         }
         if (implicit_sq_residual_norm) {
-            implicit_res_norms.push_back(std::sqrt(
+            implicit_res_norms->push_back(std::sqrt(
                 get_norm(gko::as<vec<rc_vtype>>(implicit_sq_residual_norm))));
             has_implicit_res_norm = true;
         } else {
-            implicit_res_norms.push_back(-1.0);
+            implicit_res_norms->push_back(-1.0);
         }
     }
 
@@ -219,11 +219,11 @@ struct ResidualLogger : gko::log::Logger {
           matrix{matrix.get()},
           b{b.get()},
           start{std::chrono::steady_clock::now()},
-          rec_res_norms{rec_res_norms},
-          true_res_norms{true_res_norms},
+          rec_res_norms{&rec_res_norms},
+          true_res_norms{&true_res_norms},
           has_implicit_res_norm{},
-          implicit_res_norms{implicit_res_norms},
-          timestamps{timestamps}
+          implicit_res_norms{&implicit_res_norms},
+          timestamps{&timestamps}
     {}
 
     bool has_implicit_res_norms() const { return has_implicit_res_norm; }
@@ -232,11 +232,11 @@ struct ResidualLogger : gko::log::Logger {
     const gko::LinOp* matrix;
     const gko::LinOp* b;
     std::chrono::steady_clock::time_point start;
-    json& rec_res_norms;
-    json& true_res_norms;
+    json* rec_res_norms;
+    json* true_res_norms;
     mutable bool has_implicit_res_norm;
-    json& implicit_res_norms;
-    json& timestamps;
+    json* implicit_res_norms;
+    json* timestamps;
 };
 
 

From 7b482dcf416e940b76c775ec67c0a39b286852aa Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 29 Aug 2023 19:24:34 +0200
Subject: [PATCH 12/13] review updates

- don't install nlohmann-json
- simplify code
- improve config description formatting

Co-authored-by: Yuhsiang M. Tsai <yhmtsai@gmail.com>
---
 benchmark/preconditioner/preconditioner.cpp                | 3 +--
 benchmark/spmv/spmv_common.hpp                             | 3 ---
 benchmark/test/CMakeLists.txt                              | 2 +-
 benchmark/test/reference/conversion.all.stderr             | 2 +-
 benchmark/test/reference/conversion.profile.stderr         | 6 +++---
 benchmark/test/reference/conversion.simple.stderr          | 2 +-
 benchmark/test/reference/distributed_solver.profile.stderr | 6 +++---
 benchmark/test/reference/distributed_solver.simple.stderr  | 2 +-
 benchmark/test/reference/matrix_statistics.simple.stderr   | 2 +-
 benchmark/test/reference/preconditioner.profile.stderr     | 6 +++---
 benchmark/test/reference/preconditioner.simple.stderr      | 2 +-
 benchmark/test/reference/solver.profile.stderr             | 6 +++---
 benchmark/test/reference/solver.simple.stderr              | 2 +-
 benchmark/test/reference/sparse_blas.profile.stderr        | 6 +++---
 benchmark/test/reference/sparse_blas.simple.stderr         | 2 +-
 benchmark/test/reference/spmv.profile.stderr               | 6 +++---
 benchmark/test/reference/spmv.simple.stderr                | 2 +-
 benchmark/test/reference/spmv_distributed.profile.stderr   | 6 +++---
 benchmark/test/reference/spmv_distributed.simple.stderr    | 2 +-
 benchmark/utils/generator.hpp                              | 6 +++---
 third_party/nlohmann_json/CMakeLists.txt                   | 1 +
 21 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp
index 98f116f9b12..074fe202e6c 100644
--- a/benchmark/preconditioner/preconditioner.cpp
+++ b/benchmark/preconditioner/preconditioner.cpp
@@ -205,9 +205,8 @@ struct PreconditionerBenchmark : Benchmark<preconditioner_benchmark_state> {
              json& precond_case) const override
     {
         auto decoded_precond_name = precond_decoder.at(encoded_precond_name);
-        precond_case["generate"] = json::object();
-        precond_case["apply"] = json::object();
         for (auto stage : {"generate", "apply"}) {
+            precond_case[stage] = json::object();
             precond_case[stage]["components"] = json::object();
         }
 
diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp
index f589077834e..c85642bb5f1 100644
--- a/benchmark/spmv/spmv_common.hpp
+++ b/benchmark/spmv/spmv_common.hpp
@@ -211,9 +211,6 @@ struct SpmvBenchmark : Benchmark<spmv_benchmark_state<Generator>> {
         std::string best_format;
         // find the fastest among all formats we tested
         for (const auto& format : formats) {
-            if (!test_case[name].contains(format)) {
-                continue;
-            }
             auto& format_case = test_case[name][format];
             if (format_case.contains("completed") &&
                 format_case["completed"].template get<bool>()) {
diff --git a/benchmark/test/CMakeLists.txt b/benchmark/test/CMakeLists.txt
index 1cd589927fa..2f43b6eaf71 100644
--- a/benchmark/test/CMakeLists.txt
+++ b/benchmark/test/CMakeLists.txt
@@ -25,4 +25,4 @@ if (GINKGO_BUILD_MPI)
     add_benchmark_test(multi_vector_distributed)
     add_benchmark_test(spmv_distributed)
     add_benchmark_test(solver_distributed)
-endif()
\ No newline at end of file
+endif()
diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr
index 77ff50a1b89..f6f1002e443 100644
--- a/benchmark/test/reference/conversion.all.stderr
+++ b/benchmark/test/reference/conversion.all.stderr
@@ -4,7 +4,7 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr,ell,sellp,hybrid
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 Matrix is of size (125, 125), 725
 	Running conversion: coo-read
 	Running conversion: coo-csr
diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr
index 3a4301b13eb..b25fb4d42ee 100644
--- a/benchmark/test/reference/conversion.profile.stderr
+++ b/benchmark/test/reference/conversion.profile.stderr
@@ -4,11 +4,11 @@ Running on reference(0)
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 Matrix is of size (125, 125), 725
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
-DEBUG: begin stencil(100,7pt)
+DEBUG: begin stencil(100, 7pt)
 	Running conversion: coo-read
 DEBUG: begin coo-read
 DEBUG: begin repetition
@@ -79,4 +79,4 @@ DEBUG: end   components::convert_ptrs_to_idxs
 DEBUG: end   copy(<typename>)
 DEBUG: end   repetition
 DEBUG: end   csr-coo
-DEBUG: end   stencil(100,7pt)
+DEBUG: end   stencil(100, 7pt)
diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr
index 9b51effac09..53777a4fc53 100644
--- a/benchmark/test/reference/conversion.simple.stderr
+++ b/benchmark/test/reference/conversion.simple.stderr
@@ -4,7 +4,7 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo,csr
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 Matrix is of size (125, 125), 725
 	Running conversion: coo-read
 	Running conversion: coo-csr
diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr
index 227737e56b3..e8ef115f8c2 100644
--- a/benchmark/test/reference/distributed_solver.profile.stderr
+++ b/benchmark/test/reference/distributed_solver.profile.stderr
@@ -5,7 +5,7 @@ Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-Running test case stencil(100,7pt,stencil)
+Running test case stencil(100, 7pt, stencil)
 DEBUG: begin partition::build_ranges_from_global_size
 DEBUG: end   partition::build_ranges_from_global_size
 DEBUG: begin components::fill_array
@@ -77,7 +77,7 @@ DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
 Matrix is of size (125, 125)
-DEBUG: begin stencil(100,7pt,stencil)
+DEBUG: begin stencil(100, 7pt, stencil)
 	Running solver: cg
 DEBUG: begin cg
 DEBUG: begin dense::compute_squared_norm2
@@ -445,4 +445,4 @@ DEBUG: end   dense::compute_sqrt
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: end   cg
-DEBUG: end   stencil(100,7pt,stencil)
+DEBUG: end   stencil(100, 7pt, stencil)
diff --git a/benchmark/test/reference/distributed_solver.simple.stderr b/benchmark/test/reference/distributed_solver.simple.stderr
index 607081a3949..bdf57c2d0e1 100644
--- a/benchmark/test/reference/distributed_solver.simple.stderr
+++ b/benchmark/test/reference/distributed_solver.simple.stderr
@@ -5,6 +5,6 @@ Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-Running test case stencil(100,7pt,stencil)
+Running test case stencil(100, 7pt, stencil)
 Matrix is of size (125, 125)
 	Running solver: cg
diff --git a/benchmark/test/reference/matrix_statistics.simple.stderr b/benchmark/test/reference/matrix_statistics.simple.stderr
index d02edbc44da..bfaa411873e 100644
--- a/benchmark/test/reference/matrix_statistics.simple.stderr
+++ b/benchmark/test/reference/matrix_statistics.simple.stderr
@@ -1,4 +1,4 @@
 This is Ginkgo 1.7.0 (develop)
     running with core module 1.7.0 (develop)
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 Matrix is of size (125, 125), 725
diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr
index e2069c318d2..328a738583c 100644
--- a/benchmark/test/reference/preconditioner.profile.stderr
+++ b/benchmark/test/reference/preconditioner.profile.stderr
@@ -4,7 +4,7 @@ Running on reference(0)
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running with preconditioners: none
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin components::aos_to_soa
@@ -24,7 +24,7 @@ DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
 DEBUG: end   dense::fill_in_matrix_data
 Matrix is of size (125, 125), 725
-DEBUG: begin stencil(100,7pt)
+DEBUG: begin stencil(100, 7pt)
 	Running preconditioner: none
 DEBUG: begin none
 DEBUG: begin copy(<typename>)
@@ -44,4 +44,4 @@ DEBUG: end   copy(<typename>)
 DEBUG: end   apply(<typename>)
 DEBUG: end   repetition apply
 DEBUG: end   none
-DEBUG: end   stencil(100,7pt)
+DEBUG: end   stencil(100, 7pt)
diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr
index 0090e180d2b..a428671486f 100644
--- a/benchmark/test/reference/preconditioner.simple.stderr
+++ b/benchmark/test/reference/preconditioner.simple.stderr
@@ -4,6 +4,6 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 Running with preconditioners: none
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 Matrix is of size (125, 125), 725
 	Running preconditioner: none
diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr
index 5e1e2cdb312..a9846dff61f 100644
--- a/benchmark/test/reference/solver.profile.stderr
+++ b/benchmark/test/reference/solver.profile.stderr
@@ -5,7 +5,7 @@ Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
 DEBUG: begin components::aos_to_soa
@@ -23,7 +23,7 @@ DEBUG: begin dense::copy
 DEBUG: end   dense::copy
 DEBUG: end   copy(<typename>)
 Matrix is of size (125, 125)
-DEBUG: begin stencil(100,7pt)
+DEBUG: begin stencil(100, 7pt)
 	Running solver: cg
 DEBUG: begin cg
 DEBUG: begin dense::compute_norm2_dispatch
@@ -297,4 +297,4 @@ DEBUG: end   dense::compute_norm2_dispatch
 DEBUG: begin copy
 DEBUG: end   copy
 DEBUG: end   cg
-DEBUG: end   stencil(100,7pt)
+DEBUG: end   stencil(100, 7pt)
diff --git a/benchmark/test/reference/solver.simple.stderr b/benchmark/test/reference/solver.simple.stderr
index 659dd026588..d9c04b69cf5 100644
--- a/benchmark/test/reference/solver.simple.stderr
+++ b/benchmark/test/reference/solver.simple.stderr
@@ -5,6 +5,6 @@ Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 Running cg with 1000 iterations and residual goal of 1.000000e-06
 The number of right hand sides is 1
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 Matrix is of size (125, 125)
 	Running solver: cg
diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr
index fd991de7063..70a9299ccae 100644
--- a/benchmark/test/reference/sparse_blas.profile.stderr
+++ b/benchmark/test/reference/sparse_blas.profile.stderr
@@ -4,7 +4,7 @@ Running on reference(0)
 Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The operations are transpose
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 Matrix is of size (125, 125), 725
 DEBUG: begin components::fill_array
 DEBUG: end   components::fill_array
@@ -12,7 +12,7 @@ DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
 DEBUG: begin components::convert_idxs_to_ptrs
 DEBUG: end   components::convert_idxs_to_ptrs
-DEBUG: begin stencil(100,7pt)
+DEBUG: begin stencil(100, 7pt)
 	Running sparse_blas: transpose
 DEBUG: begin transpose
 DEBUG: begin repetition
@@ -22,4 +22,4 @@ DEBUG: begin csr::transpose
 DEBUG: end   csr::transpose
 DEBUG: end   repetition
 DEBUG: end   transpose
-DEBUG: end   stencil(100,7pt)
+DEBUG: end   stencil(100, 7pt)
diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr
index 1f2bb34809f..fe6cf23d5b7 100644
--- a/benchmark/test/reference/sparse_blas.simple.stderr
+++ b/benchmark/test/reference/sparse_blas.simple.stderr
@@ -4,6 +4,6 @@ Running on reference(0)
 Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The operations are transpose
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 Matrix is of size (125, 125), 725
 	Running sparse_blas: transpose
diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr
index 1cc24a5f186..3c3ec3b7cfe 100644
--- a/benchmark/test/reference/spmv.profile.stderr
+++ b/benchmark/test/reference/spmv.profile.stderr
@@ -5,7 +5,7 @@ Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The formats are coo
 The number of right hand sides is 1
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 DEBUG: begin components::aos_to_soa
 DEBUG: end   components::aos_to_soa
 DEBUG: begin dense::fill
@@ -19,7 +19,7 @@ DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
 DEBUG: end   dense::fill_in_matrix_data
 Matrix is of size (125, 125), 725
-DEBUG: begin stencil(100,7pt)
+DEBUG: begin stencil(100, 7pt)
 	Running spmv: coo
 DEBUG: begin coo
 DEBUG: begin components::aos_to_soa
@@ -35,4 +35,4 @@ DEBUG: end   coo::spmv
 DEBUG: end   apply(<typename>)
 DEBUG: end   repetition
 DEBUG: end   coo
-DEBUG: end   stencil(100,7pt)
+DEBUG: end   stencil(100, 7pt)
diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr
index 9d5047febb6..97fe670aff7 100644
--- a/benchmark/test/reference/spmv.simple.stderr
+++ b/benchmark/test/reference/spmv.simple.stderr
@@ -5,6 +5,6 @@ Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are coo
 The number of right hand sides is 1
-Running test case stencil(100,7pt)
+Running test case stencil(100, 7pt)
 Matrix is of size (125, 125), 725
 	Running spmv: coo
diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr
index f0d28332ef0..dc3cfd377c7 100644
--- a/benchmark/test/reference/spmv_distributed.profile.stderr
+++ b/benchmark/test/reference/spmv_distributed.profile.stderr
@@ -5,7 +5,7 @@ Running with 0 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
 The formats are [csr]x[csr]
 The number of right hand sides is 1
-Running test case stencil(100,7pt,stencil)
+Running test case stencil(100, 7pt, stencil)
 DEBUG: begin partition::build_ranges_from_global_size
 DEBUG: end   partition::build_ranges_from_global_size
 DEBUG: begin components::fill_array
@@ -55,7 +55,7 @@ DEBUG: end   dense::fill
 DEBUG: begin dense::fill_in_matrix_data
 DEBUG: end   dense::fill_in_matrix_data
 Matrix is of size (81, 81), 144
-DEBUG: begin stencil(100,7pt,stencil)
+DEBUG: begin stencil(100, 7pt, stencil)
 	Running spmv: csr-csr
 DEBUG: begin csr-csr
 DEBUG: begin partition::build_ranges_from_global_size
@@ -137,4 +137,4 @@ DEBUG: end   advanced_apply(<typename>)
 DEBUG: end   apply(<typename>)
 DEBUG: end   repetition
 DEBUG: end   csr-csr
-DEBUG: end   stencil(100,7pt,stencil)
+DEBUG: end   stencil(100, 7pt, stencil)
diff --git a/benchmark/test/reference/spmv_distributed.simple.stderr b/benchmark/test/reference/spmv_distributed.simple.stderr
index 0df742d5b9b..7c7f6fccf54 100644
--- a/benchmark/test/reference/spmv_distributed.simple.stderr
+++ b/benchmark/test/reference/spmv_distributed.simple.stderr
@@ -5,6 +5,6 @@ Running with 2 warm iterations and 10 running iterations
 The random seed for right hand sides is 42
 The formats are [csr]x[csr]
 The number of right hand sides is 1
-Running test case stencil(100,7pt,stencil)
+Running test case stencil(100, 7pt, stencil)
 Matrix is of size (81, 81), 144
 	Running spmv: csr-csr
diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp
index 257a2384634..3f26ed3f2fc 100644
--- a/benchmark/utils/generator.hpp
+++ b/benchmark/utils/generator.hpp
@@ -90,7 +90,7 @@ struct DefaultSystemGenerator {
             return config["filename"].get<std::string>();
         } else if (config.contains("stencil")) {
             std::stringstream ss;
-            ss << "stencil(" << config["size"].get<gko::int64>() << ","
+            ss << "stencil(" << config["size"].get<gko::int64>() << ", "
                << config["stencil"].get<std::string>() << ")";
             return ss.str();
         } else {
@@ -231,8 +231,8 @@ struct DistributedDefaultSystemGenerator {
             return config["filename"].get<std::string>();
         } else if (config.contains("stencil")) {
             std::stringstream ss;
-            ss << "stencil(" << config["size"].get<gko::int64>() << ","
-               << config["stencil"].get<std::string>() << ","
+            ss << "stencil(" << config["size"].get<gko::int64>() << ", "
+               << config["stencil"].get<std::string>() << ", "
                << config["comm_pattern"].get<std::string>() << ")";
             return ss.str();
         } else {
diff --git a/third_party/nlohmann_json/CMakeLists.txt b/third_party/nlohmann_json/CMakeLists.txt
index b95cfa5606a..6f413e458b9 100644
--- a/third_party/nlohmann_json/CMakeLists.txt
+++ b/third_party/nlohmann_json/CMakeLists.txt
@@ -6,4 +6,5 @@ FetchContent_Declare(
     GIT_TAG        v3.9.1
 )
 set(JSON_BuildTests OFF CACHE INTERNAL "")
+set(JSON_Install OFF CACHE INTERNAL "")
 FetchContent_MakeAvailable(nlohmann_json)

From fe3789ce2c66919109fd2a18d3f67ec8e0bddeb8 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 30 Aug 2023 11:30:08 +0200
Subject: [PATCH 13/13] keep trailing EOL

---
 benchmark/test/reference/blas.profile.stdout                   | 2 +-
 benchmark/test/reference/blas.simple.stdout                    | 2 +-
 benchmark/test/reference/conversion.all.stdout                 | 2 +-
 benchmark/test/reference/conversion.matrix.stdout              | 2 +-
 benchmark/test/reference/conversion.profile.stdout             | 2 +-
 benchmark/test/reference/conversion.simple.stdout              | 2 +-
 benchmark/test/reference/distributed_solver.matrix.stdout      | 2 +-
 benchmark/test/reference/distributed_solver.profile.stdout     | 2 +-
 benchmark/test/reference/distributed_solver.simple.stdout      | 2 +-
 benchmark/test/reference/matrix_statistics.matrix.stdout       | 2 +-
 benchmark/test/reference/matrix_statistics.simple.stdout       | 2 +-
 .../test/reference/multi_vector_distributed.profile.stdout     | 2 +-
 .../test/reference/multi_vector_distributed.simple.stdout      | 2 +-
 benchmark/test/reference/preconditioner.matrix.stdout          | 2 +-
 benchmark/test/reference/preconditioner.profile.stdout         | 2 +-
 benchmark/test/reference/preconditioner.simple.stdout          | 2 +-
 benchmark/test/reference/solver.matrix.stdout                  | 2 +-
 benchmark/test/reference/solver.profile.stdout                 | 2 +-
 benchmark/test/reference/solver.simple.stdout                  | 2 +-
 benchmark/test/reference/sparse_blas.matrix.stdout             | 2 +-
 benchmark/test/reference/sparse_blas.profile.stdout            | 2 +-
 benchmark/test/reference/sparse_blas.simple.stdout             | 2 +-
 benchmark/test/reference/spmv.matrix.stdout                    | 2 +-
 benchmark/test/reference/spmv.profile.stdout                   | 2 +-
 benchmark/test/reference/spmv.simple.stdout                    | 2 +-
 benchmark/test/reference/spmv_distributed.profile.stdout       | 2 +-
 benchmark/test/reference/spmv_distributed.simple.stdout        | 2 +-
 benchmark/test/test_framework.py.in                            | 3 ++-
 28 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/benchmark/test/reference/blas.profile.stdout b/benchmark/test/reference/blas.profile.stdout
index 8998d5eaed7..209e115b557 100644
--- a/benchmark/test/reference/blas.profile.stdout
+++ b/benchmark/test/reference/blas.profile.stdout
@@ -25,4 +25,4 @@
             }
         }
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/blas.simple.stdout b/benchmark/test/reference/blas.simple.stdout
index a586a9bc57b..54745d81104 100644
--- a/benchmark/test/reference/blas.simple.stdout
+++ b/benchmark/test/reference/blas.simple.stdout
@@ -25,4 +25,4 @@
             }
         }
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/conversion.all.stdout b/benchmark/test/reference/conversion.all.stdout
index 0c77d464793..e7a5b8f0f51 100644
--- a/benchmark/test/reference/conversion.all.stdout
+++ b/benchmark/test/reference/conversion.all.stdout
@@ -73,4 +73,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/conversion.matrix.stdout b/benchmark/test/reference/conversion.matrix.stdout
index 7f27b0c25b3..8489e4b30b4 100644
--- a/benchmark/test/reference/conversion.matrix.stdout
+++ b/benchmark/test/reference/conversion.matrix.stdout
@@ -27,4 +27,4 @@
         "cols": 36,
         "nonzeros": 208
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/conversion.profile.stdout b/benchmark/test/reference/conversion.profile.stdout
index a9c3ea674fa..907eac5b951 100644
--- a/benchmark/test/reference/conversion.profile.stdout
+++ b/benchmark/test/reference/conversion.profile.stdout
@@ -28,4 +28,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/conversion.simple.stdout b/benchmark/test/reference/conversion.simple.stdout
index 81c735789d1..91b69b8a248 100644
--- a/benchmark/test/reference/conversion.simple.stdout
+++ b/benchmark/test/reference/conversion.simple.stdout
@@ -28,4 +28,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/distributed_solver.matrix.stdout b/benchmark/test/reference/distributed_solver.matrix.stdout
index ec1d258e2f4..67ac333bec5 100644
--- a/benchmark/test/reference/distributed_solver.matrix.stdout
+++ b/benchmark/test/reference/distributed_solver.matrix.stdout
@@ -54,4 +54,4 @@
         "rows": 36,
         "cols": 36
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/distributed_solver.profile.stdout b/benchmark/test/reference/distributed_solver.profile.stdout
index 55dfb1dc428..0a844879c4f 100644
--- a/benchmark/test/reference/distributed_solver.profile.stdout
+++ b/benchmark/test/reference/distributed_solver.profile.stdout
@@ -30,4 +30,4 @@
         "rows": 125,
         "cols": 125
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/distributed_solver.simple.stdout b/benchmark/test/reference/distributed_solver.simple.stdout
index eed8d864388..458115e6ab2 100644
--- a/benchmark/test/reference/distributed_solver.simple.stdout
+++ b/benchmark/test/reference/distributed_solver.simple.stdout
@@ -56,4 +56,4 @@
         "rows": 125,
         "cols": 125
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/matrix_statistics.matrix.stdout b/benchmark/test/reference/matrix_statistics.matrix.stdout
index a6297e89b66..f5eba9461f7 100644
--- a/benchmark/test/reference/matrix_statistics.matrix.stdout
+++ b/benchmark/test/reference/matrix_statistics.matrix.stdout
@@ -36,4 +36,4 @@
         "cols": 36,
         "nonzeros": 208
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/matrix_statistics.simple.stdout b/benchmark/test/reference/matrix_statistics.simple.stdout
index 923bbc9f962..23124781a7d 100644
--- a/benchmark/test/reference/matrix_statistics.simple.stdout
+++ b/benchmark/test/reference/matrix_statistics.simple.stdout
@@ -37,4 +37,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stdout b/benchmark/test/reference/multi_vector_distributed.profile.stdout
index 8998d5eaed7..209e115b557 100644
--- a/benchmark/test/reference/multi_vector_distributed.profile.stdout
+++ b/benchmark/test/reference/multi_vector_distributed.profile.stdout
@@ -25,4 +25,4 @@
             }
         }
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stdout b/benchmark/test/reference/multi_vector_distributed.simple.stdout
index a586a9bc57b..54745d81104 100644
--- a/benchmark/test/reference/multi_vector_distributed.simple.stdout
+++ b/benchmark/test/reference/multi_vector_distributed.simple.stdout
@@ -25,4 +25,4 @@
             }
         }
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/preconditioner.matrix.stdout b/benchmark/test/reference/preconditioner.matrix.stdout
index 51adb7383c3..742ec55c41d 100644
--- a/benchmark/test/reference/preconditioner.matrix.stdout
+++ b/benchmark/test/reference/preconditioner.matrix.stdout
@@ -28,4 +28,4 @@
         "cols": 36,
         "nonzeros": 208
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/preconditioner.profile.stdout b/benchmark/test/reference/preconditioner.profile.stdout
index e33a6502eea..526349b55ad 100644
--- a/benchmark/test/reference/preconditioner.profile.stdout
+++ b/benchmark/test/reference/preconditioner.profile.stdout
@@ -21,4 +21,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/preconditioner.simple.stdout b/benchmark/test/reference/preconditioner.simple.stdout
index 06291228a1c..ed567dcbb13 100644
--- a/benchmark/test/reference/preconditioner.simple.stdout
+++ b/benchmark/test/reference/preconditioner.simple.stdout
@@ -29,4 +29,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/solver.matrix.stdout b/benchmark/test/reference/solver.matrix.stdout
index a87e78f7f66..594a3887921 100644
--- a/benchmark/test/reference/solver.matrix.stdout
+++ b/benchmark/test/reference/solver.matrix.stdout
@@ -52,4 +52,4 @@
         "rows": 36,
         "cols": 36
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/solver.profile.stdout b/benchmark/test/reference/solver.profile.stdout
index 906c74de5e7..c132ed1a572 100644
--- a/benchmark/test/reference/solver.profile.stdout
+++ b/benchmark/test/reference/solver.profile.stdout
@@ -29,4 +29,4 @@
         "rows": 125,
         "cols": 125
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/solver.simple.stdout b/benchmark/test/reference/solver.simple.stdout
index 5d127fe4b78..0ee0e4b9a4b 100644
--- a/benchmark/test/reference/solver.simple.stdout
+++ b/benchmark/test/reference/solver.simple.stdout
@@ -53,4 +53,4 @@
         "rows": 125,
         "cols": 125
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/sparse_blas.matrix.stdout b/benchmark/test/reference/sparse_blas.matrix.stdout
index 74fdbf98e7a..a50fa1159d9 100644
--- a/benchmark/test/reference/sparse_blas.matrix.stdout
+++ b/benchmark/test/reference/sparse_blas.matrix.stdout
@@ -21,4 +21,4 @@
         "cols": 36,
         "nonzeros": 208
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/sparse_blas.profile.stdout b/benchmark/test/reference/sparse_blas.profile.stdout
index e9d48fde23d..45cb7e2638a 100644
--- a/benchmark/test/reference/sparse_blas.profile.stdout
+++ b/benchmark/test/reference/sparse_blas.profile.stdout
@@ -15,4 +15,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/sparse_blas.simple.stdout b/benchmark/test/reference/sparse_blas.simple.stdout
index 3cc5f774ebf..a44f4f189b2 100644
--- a/benchmark/test/reference/sparse_blas.simple.stdout
+++ b/benchmark/test/reference/sparse_blas.simple.stdout
@@ -22,4 +22,4 @@
         "cols": 125,
         "nonzeros": 725
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/spmv.matrix.stdout b/benchmark/test/reference/spmv.matrix.stdout
index 4d03ce3cd07..ea5927ba148 100644
--- a/benchmark/test/reference/spmv.matrix.stdout
+++ b/benchmark/test/reference/spmv.matrix.stdout
@@ -17,4 +17,4 @@
             "spmv": "coo"
         }
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/spmv.profile.stdout b/benchmark/test/reference/spmv.profile.stdout
index 409a92d4e33..6e4701af719 100644
--- a/benchmark/test/reference/spmv.profile.stdout
+++ b/benchmark/test/reference/spmv.profile.stdout
@@ -17,4 +17,4 @@
             "spmv": "coo"
         }
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/spmv.simple.stdout b/benchmark/test/reference/spmv.simple.stdout
index 9601a15b331..38f2598c616 100644
--- a/benchmark/test/reference/spmv.simple.stdout
+++ b/benchmark/test/reference/spmv.simple.stdout
@@ -18,4 +18,4 @@
             "spmv": "coo"
         }
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout
index 8de6a68ae8a..bbef87d0b89 100644
--- a/benchmark/test/reference/spmv_distributed.profile.stdout
+++ b/benchmark/test/reference/spmv_distributed.profile.stdout
@@ -18,4 +18,4 @@
             "spmv": "csr-csr"
         }
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout
index f94e4b992a1..77bdef168d3 100644
--- a/benchmark/test/reference/spmv_distributed.simple.stdout
+++ b/benchmark/test/reference/spmv_distributed.simple.stdout
@@ -19,4 +19,4 @@
             "spmv": "csr-csr"
         }
     }
-]
\ No newline at end of file
+]
diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index 1a07818df1f..62c4293e7c0 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -92,7 +92,8 @@ def sanitize_json_text(input: str) -> List[str]:
     """
 
     result = json.dumps(sanitize_json(json.loads(input)), indent=4)
-    return result.splitlines()
+    # json.dumps doesn't add a trailing newline
+    return result.splitlines() + [""]
 
 
 def sanitize_text(