diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp
index cfa56ef81fe..d906e9f9e12 100644
--- a/benchmark/sparse_blas/sparse_blas.cpp
+++ b/benchmark/sparse_blas/sparse_blas.cpp
@@ -127,9 +127,12 @@ void apply_sparse_blas(const char* operation_name,
                               allocator);
             auto gen_logger = create_operations_logger(
                 FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                test_case[operation_name]["components"], allocator, 1);
+                test_case[operation_name]["components"], allocator,
+                repetitions);
             exec->add_logger(gen_logger);
-            op->run();
+            for (unsigned i = 0; i < repetitions; i++) {
+                op->run();
+            }
             exec->remove_logger(gen_logger);
         }
         op->write_stats(test_case[operation_name], allocator);
diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 5c6d849fe36..335ed687002 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ostream>
 #include <random>
 #include <sstream>
+#include <stdexcept>
 #include <string>
 #include <type_traits>
 #include <utility>
@@ -58,6 +59,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <rapidjson/prettywriter.h>
 
 
+#include <ginkgo/core/base/memory.hpp>
+
+
 #include "benchmark/utils/json.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
@@ -69,6 +73,10 @@ DEFINE_string(executor, "reference",
               "The executor used to run the benchmarks, one of: reference, "
               "omp, cuda, hip");
 
+DEFINE_string(allocator, "default",  // read by create_{cuda,hip}_allocator()
+              "The allocator used in the executor. Only relevant for CUDA and "
+              "HIP executors, one of: default, async, host, unified");
+
 DEFINE_uint32(device_id, 0, "ID of the device where to run the code");
 
 DEFINE_bool(overwrite, false,
@@ -329,6 +337,40 @@ void backup_results(rapidjson::Document& results)
 }
 
 
+// Creates the CUDA allocator named by --allocator; throws if it is unknown.
+inline std::shared_ptr<gko::CudaAllocatorBase> create_cuda_allocator()
+{
+    const std::string kind{FLAGS_allocator};
+    if (kind == "default") {
+        return std::make_shared<gko::CudaAllocator>();
+    } else if (kind == "async") {
+        return std::make_shared<gko::CudaAsyncAllocator>(nullptr);
+    } else if (kind == "host") {
+        return std::make_shared<gko::CudaHostAllocator>(FLAGS_device_id);
+    } else if (kind == "unified") {
+        return std::make_shared<gko::CudaUnifiedAllocator>(FLAGS_device_id);
+    }
+    throw std::runtime_error{"Unknown allocator type " + kind};
+}
+
+
+// Creates the HIP allocator named by --allocator; throws if it is unknown.
+inline std::shared_ptr<gko::HipAllocatorBase> create_hip_allocator()
+{
+    const std::string kind{FLAGS_allocator};
+    if (kind == "default") {
+        return std::make_shared<gko::HipAllocator>();
+    } else if (kind == "async") {
+        return std::make_shared<gko::HipAsyncAllocator>(nullptr);
+    } else if (kind == "host") {
+        return std::make_shared<gko::HipHostAllocator>(FLAGS_device_id);
+    } else if (kind == "unified") {
+        return std::make_shared<gko::HipUnifiedAllocator>(FLAGS_device_id);
+    }
+    throw std::runtime_error{"Unknown allocator type " + kind};
+}
+
+
 // executor mapping
 const std::map<std::string, std::function<std::shared_ptr<gko::Executor>(bool)>>
     executor_factory{
@@ -337,12 +379,14 @@ const std::map<std::string, std::function<std::shared_ptr<gko::Executor>(bool)>>
         {"cuda",
          [](bool) {
              return gko::CudaExecutor::create(FLAGS_device_id,
-                                              gko::OmpExecutor::create());
+                                              gko::OmpExecutor::create(),
+                                              create_cuda_allocator());
          }},
         {"hip",
          [](bool) {
              return gko::HipExecutor::create(FLAGS_device_id,
-                                             gko::OmpExecutor::create());
+                                             gko::OmpExecutor::create(),
+                                             create_hip_allocator());
          }},
         {"dpcpp", [](bool use_gpu_timer) {
              auto property = dpcpp_queue_property::in_order;
@@ -369,14 +413,16 @@ const std::map<std::string,
              FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id(
                  comm, gko::CudaExecutor::get_num_devices());
              return gko::CudaExecutor::create(FLAGS_device_id,
-                                              gko::ReferenceExecutor::create());
+                                              gko::ReferenceExecutor::create(),
+                                              create_cuda_allocator());
          }},
         {"hip",
          [](MPI_Comm comm) {
              FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id(
                  comm, gko::HipExecutor::get_num_devices());
              return gko::HipExecutor::create(FLAGS_device_id,
-                                             gko::ReferenceExecutor::create());
+                                             gko::ReferenceExecutor::create(),
+                                             create_hip_allocator());
          }},
         {"dpcpp", [](MPI_Comm comm) {
              if (gko::DpcppExecutor::get_num_devices("gpu")) {
diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index 03ab12deb46..ff644a5f05f 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -75,6 +75,10 @@ bool CudaAsyncAllocator::check_environment(int device_id,
     GKO_NOT_COMPILED(cuda);
 
 
+CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id)
+    GKO_NOT_COMPILED(cuda);
+
+
 CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags)
     GKO_NOT_COMPILED(cuda);
 
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index dec1de15933..521b2590626 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -76,6 +76,9 @@ bool HipAsyncAllocator::check_environment(int device_id,
     GKO_NOT_COMPILED(hip);
 
 
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id) GKO_NOT_COMPILED(hip);
+
+
 HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags)
     GKO_NOT_COMPILED(hip);
 
diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp
index f605d9135ea..b5bfb14ac74 100644
--- a/cuda/base/memory.cpp
+++ b/cuda/base/memory.cpp
@@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/memory.hpp>
 
 
+#include <cuda.h>
 #include <cuda_runtime.h>