diff --git a/Cargo.toml b/Cargo.toml
index e40c0d09..14251fdc 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,21 +21,24 @@ tokio = { version = "1.2", features = ["net", "rt"] }
 scoped-tls = "1.0.0"
 slab = "0.4.2"
 libc = "0.2.80"
-io-uring = { version = "0.5.0", features = ["unstable"] }
-socket2 = { version = "0.4.4", features = ["all"] }
+io-uring = { version = "0.5.0", features = [ "unstable" ] }
+socket2 = { version = "0.4.4", features = [ "all"] }
 bytes = { version = "1.0", optional = true }
 
 [dev-dependencies]
 tempfile = "3.2.0"
 tokio-test = "0.4.2"
 iai = "0.1.1"
-criterion = "0.4.0"
-tokio = { version = "1.21.2", features = [] }
+futures = "0.3.25"
+criterion = {version = "0.3.6", features = ["async"]}
+pprof = {version = "0.10.1", features = ["flamegraph", "criterion"]}
 
 [package.metadata.docs.rs]
 all-features = true
 
 [profile.bench]
+lto = "fat"
+codegen = "1"
 debug = true
 
 [[bench]]
diff --git a/benches/criterion/no_op.rs b/benches/criterion/no_op.rs
index 998a2de3..3dc2589e 100644
--- a/benches/criterion/no_op.rs
+++ b/benches/criterion/no_op.rs
@@ -1,16 +1,23 @@
 use criterion::{
-    criterion_group, criterion_main, BenchmarkId, Criterion, SamplingMode, Throughput,
+    criterion_group, criterion_main, BenchmarkId, Criterion, Throughput,
 };
-use std::time::{Duration, Instant};
+use pprof::criterion::{Output, PProfProfiler};
 
 use tokio::task::JoinSet;
 
+// To be upstreamed into Criterion, on next release (> 0.3.0)
+struct AsyncRuntime(tokio_uring::Runtime);
+
+impl criterion::async_executor::AsyncExecutor for &AsyncRuntime {
+    fn block_on<T>(&self, future: impl futures::Future<Output = T>) -> T {
+        self.0.block_on(future)
+    }
+}
+
 #[derive(Clone)]
 struct Options {
     iterations: usize,
     concurrency: usize,
-    sq_size: usize,
-    cq_size: usize,
 }
 
 impl Default for Options {
@@ -18,55 +25,43 @@ impl Default for Options {
         Self {
             iterations: 100000,
             concurrency: 1,
-            sq_size: 128,
-            cq_size: 256,
         }
     }
 }
 
-fn run_no_ops(opts: &Options, count: u64) -> Duration {
-    let mut ring_opts = tokio_uring::uring_builder();
-    ring_opts
-        .setup_cqsize(opts.cq_size as _)
-        // .setup_sqpoll(10)
-        // .setup_sqpoll_cpu(1)
-        ;
-
-    let mut m = Duration::ZERO;
-
-    // Run the required number of iterations
-    for _ in 0..count {
-        m += tokio_uring::builder()
-            .entries(opts.sq_size as _)
-            .uring_builder(&ring_opts)
-            .start(async move {
-                let mut js = JoinSet::new();
-
-                for _ in 0..opts.iterations {
-                    js.spawn_local(tokio_uring::no_op());
-                }
-
-                let start = Instant::now();
+async fn run_no_ops(opts: &Options) {
+    let mut js = JoinSet::new();
 
-                while let Some(res) = js.join_next().await {
-                    res.unwrap().unwrap();
-                }
+    for _ in 0..opts.iterations {
+        js.spawn_local(tokio_uring::no_op());
+    }
 
-                start.elapsed()
-            })
+    while let Some(res) = js.join_next().await {
+        res.unwrap().unwrap();
     }
-    m
 }
 
 fn bench(c: &mut Criterion) {
     let mut group = c.benchmark_group("no_op");
     let mut opts = Options::default();
+
+    let mut ring_opts = tokio_uring::uring_builder();
+    ring_opts
+        .setup_cqsize(256)
+        // .setup_sqpoll(10)
+        // .setup_sqpoll_cpu(1)
+        ;
+
+    let mut builder = tokio_uring::builder();
+    builder.entries(128).uring_builder(&ring_opts);
+
+    let runtime = AsyncRuntime(tokio_uring::Runtime::new(&builder).unwrap());
+    let runtime = &runtime;
+
     for concurrency in [1, 32, 64, 256].iter() {
         opts.concurrency = *concurrency;
 
         // We perform long running benchmarks: this is the best mode
-        group.sampling_mode(SamplingMode::Flat);
-
         group.throughput(Throughput::Elements(opts.iterations as u64));
         group.bench_with_input(
             BenchmarkId::from_parameter(concurrency),
@@ -74,12 +69,16 @@ fn bench(c: &mut Criterion) {
             |b, opts| {
                 // Custom iterator used because we don't expose access to runtime,
                 // which is required to do async benchmarking with criterion
-                b.iter_custom(move |iter| run_no_ops(opts, iter));
+                b.to_async(runtime).iter(|| run_no_ops(opts));
             },
         );
     }
     group.finish();
 }
 
-criterion_group!(benches, bench);
+criterion_group! {
+    name = benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = bench
+}
 criterion_main!(benches);