Benchmark: run the criterion `no_op` bench through criterion's async
executor on a single shared tokio-uring runtime, and attach the pprof
flamegraph profiler to the criterion group.

diff --git a/Cargo.toml b/Cargo.toml
index e40c0d09..14251fdc 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,21 +21,24 @@ tokio = { version = "1.2", features = ["net", "rt"] }
 scoped-tls = "1.0.0"
 slab = "0.4.2"
 libc = "0.2.80"
 io-uring = { version = "0.5.0", features = ["unstable"] }
 socket2 = { version = "0.4.4", features = ["all"] }
 bytes = { version = "1.0", optional = true }
 
 [dev-dependencies]
 tempfile = "3.2.0"
 tokio-test = "0.4.2"
 iai = "0.1.1"
-criterion = "0.4.0"
-tokio = { version = "1.21.2", features = [] }
+futures = "0.3.25"
+criterion = { version = "0.3.6", features = ["async"] }
+pprof = { version = "0.10.1", features = ["flamegraph", "criterion"] }
 
 [package.metadata.docs.rs]
 all-features = true
 
 [profile.bench]
+lto = "fat"
+codegen-units = 1
 debug = true
 
 [[bench]]
diff --git a/benches/criterion/no_op.rs b/benches/criterion/no_op.rs
index 998a2de3..3dc2589e 100644
--- a/benches/criterion/no_op.rs
+++ b/benches/criterion/no_op.rs
@@ -1,16 +1,23 @@
 use criterion::{
-    criterion_group, criterion_main, BenchmarkId, Criterion, SamplingMode, Throughput,
+    criterion_group, criterion_main, BenchmarkId, Criterion, Throughput,
 };
-use std::time::{Duration, Instant};
+use pprof::criterion::{Output, PProfProfiler};
 
 use tokio::task::JoinSet;
 
+// To be upstreamed into Criterion, on next release (> 0.3.0)
+struct AsyncRuntime(tokio_uring::Runtime);
+
+impl criterion::async_executor::AsyncExecutor for &AsyncRuntime {
+    fn block_on<T>(&self, future: impl futures::Future<Output = T>) -> T {
+        self.0.block_on(future)
+    }
+}
+
 #[derive(Clone)]
 struct Options {
     iterations: usize,
     concurrency: usize,
-    sq_size: usize,
-    cq_size: usize,
 }
 
 impl Default for Options {
@@ -18,55 +25,42 @@ impl Default for Options {
         Self {
             iterations: 100000,
             concurrency: 1,
-            sq_size: 128,
-            cq_size: 256,
         }
     }
 }
 
-fn run_no_ops(opts: &Options, count: u64) -> Duration {
-    let mut ring_opts = tokio_uring::uring_builder();
-    ring_opts
-        .setup_cqsize(opts.cq_size as _)
-        // .setup_sqpoll(10)
-        // .setup_sqpoll_cpu(1)
-        ;
-
-    let mut m = Duration::ZERO;
-
-    // Run the required number of iterations
-    for _ in 0..count {
-        m += tokio_uring::builder()
-            .entries(opts.sq_size as _)
-            .uring_builder(&ring_opts)
-            .start(async move {
-                let mut js = JoinSet::new();
-
-                for _ in 0..opts.iterations {
-                    js.spawn_local(tokio_uring::no_op());
-                }
-
-                let start = Instant::now();
+async fn run_no_ops(opts: &Options) {
+    let mut js = JoinSet::new();
 
-                while let Some(res) = js.join_next().await {
-                    res.unwrap().unwrap();
-                }
+    for _ in 0..opts.iterations {
+        js.spawn_local(tokio_uring::no_op());
+    }
 
-                start.elapsed()
-            })
+    while let Some(res) = js.join_next().await {
+        res.unwrap().unwrap();
     }
-    m
 }
 
 fn bench(c: &mut Criterion) {
     let mut group = c.benchmark_group("no_op");
     let mut opts = Options::default();
+
+    let mut ring_opts = tokio_uring::uring_builder();
+    ring_opts
+        .setup_cqsize(256)
+        // .setup_sqpoll(10)
+        // .setup_sqpoll_cpu(1)
+        ;
+
+    let mut builder = tokio_uring::builder();
+    builder.entries(128).uring_builder(&ring_opts);
+
+    let runtime = AsyncRuntime(tokio_uring::Runtime::new(&builder).unwrap());
+    let runtime = &runtime;
+
     for concurrency in [1, 32, 64, 256].iter() {
         opts.concurrency = *concurrency;
 
-        // We perform long running benchmarks: this is the best mode
-        group.sampling_mode(SamplingMode::Flat);
-
         group.throughput(Throughput::Elements(opts.iterations as u64));
         group.bench_with_input(
             BenchmarkId::from_parameter(concurrency),
@@ -74,12 +68,16 @@ fn bench(c: &mut Criterion) {
             |b, opts| {
-                // Custom iterator used because we don't expose access to runtime,
-                // which is required to do async benchmarking with criterion
-                b.iter_custom(move |iter| run_no_ops(opts, iter));
+                // Run each iteration's future on the shared tokio-uring runtime
+                // through criterion's async bencher.
+                b.to_async(runtime).iter(|| run_no_ops(opts));
             },
         );
     }
     group.finish();
 }
 
-criterion_group!(benches, bench);
+criterion_group! {
+    name = benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = bench
+}
 criterion_main!(benches);