From 180dd5a22729cb83413979e498e452504cc183de Mon Sep 17 00:00:00 2001 From: Peter Faiman Date: Sat, 15 Jul 2023 02:30:08 -0700 Subject: [PATCH 1/3] fix simd accumulator overflow The intrinsics version (`opt4_simd`) has a bug: >```rust >// Flush accumulator every 256 iterations to avoid overflow >if block_i % (u8::MAX as usize + 1) == u8::MAX as usize { >``` > >The function is logically the same as our previous algorithm, with the only change being that of flushing our vector accumulator every 256 iterations to prevent overflowing each u8 lane. Flushing every 256 iterations isn't enough, because you can potentially encounter 256 `'s'` in 256 iterations, thus overflowing the max of 255. Using the function definitions from the article: ```rust fn main() { let input = "s".repeat(1024*1024); println!("{}", baseline(&input)); println!("{}", opt4_simd(&input)); } ``` Produces: ``` 1048576 -1040384 ``` The fix, performance untested: ```rust // Flush accumulator every 255 iterations to avoid overflow if block_i % u8::MAX as usize == (u8::MAX - 1) as usize { ``` Correctly produces: ``` 1048576 1048576 ``` https://lobste.rs/s/sqn7m0/n_times_faster_than_c_where_n_128#c_iooycv --- src/lib.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 341c85f..ceb5e6d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -60,7 +60,7 @@ pub fn opt4_simd(input: &str) -> i64 { let input_v = vld1q_u8(input[block_i * N_LANES..].as_ptr()); let eq_s_v = vandq_u8(input_v, one_v); acc_v = vaddq_u8(acc_v, eq_s_v); - if block_i % (u8::MAX as usize + 1) == u8::MAX as usize { + if block_i % u8::MAX as usize == (u8::MAX - 1) as usize { res += vaddlvq_u8(acc_v) as i64; acc_v = vmovq_n_u8(0); } @@ -92,7 +92,7 @@ macro_rules! simd_unrolled { let v_eq_s~I= vandq_u8(v_input~I, one_v); v_acc~I = vaddq_u8(v_acc~I, v_eq_s~I); }); - if block_i % (u8::MAX as usize + 1) == u8::MAX as usize { + if block_i % u8::MAX as usize == (u8::MAX - 1) as usize { seq!(I in 0..$unroll_factor { res += vaddlvq_u8(v_acc~I) as i64; v_acc~I = vmovq_n_u8(0); @@ -187,4 +187,11 @@ mod tests { let expected = baseline_unicode(&input); assert_eq_all!(expected, &input); } + + #[test] + fn test_all_s() { + let expected = 1024 * 1024; + let input = "s".repeat(expected); + assert_eq_all!(expected as i64, &input); + } } From 189fe90786cdcfb4f3d6d17cd4d4aa9b6b75b17f Mon Sep 17 00:00:00 2001 From: Peter Faiman Date: Sat, 15 Jul 2023 02:42:54 -0700 Subject: [PATCH 2/3] fix chunked overflow $ RUST_BACKTRACE=1 cargo test Finished test [unoptimized + debuginfo] target(s) in 0.03s Running unittests src/lib.rs (target/debug/deps/n_times_faster_than_c-39ece82b67ddb14c) running 3 tests test tests::test_simple ... ok test tests::test_all_s ... FAILED test tests::test_large ... ok v_acc~I = vaddq_u8(v_acc~I, v_eq_s~I); failures: ---- tests::test_all_s stdout ---- thread 'tests::test_all_s' panicked at 'attempt to add with overflow', /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/traits/accum.rs:149:1 stack backtrace: 0: rust_begin_unwind at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs:593:5 1: core::panicking::panic_fmt at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/panicking.rs:67:14 2: core::panicking::panic at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/panicking.rs:117:5 3: ::sum::{{closure}} at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/traits/accum.rs:53:28 4: core::iter::adapters::map::map_fold::{{closure}} at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/adapters/map.rs:84:21 5: core::iter::traits::iterator::Iterator::fold at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/traits/iterator.rs:2481:21 6: as core::iter::traits::iterator::Iterator>::fold at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/adapters/map.rs:124:9 7: ::sum at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/traits/accum.rs:50:17 8: core::iter::traits::iterator::Iterator::sum at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/traits/iterator.rs:3476:9 9: n_times_faster_than_c::opt6_chunk_count::{{closure}} at ./src/lib.rs:125:22 10: core::iter::adapters::map::map_fold::{{closure}} at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/adapters/map.rs:84:28 11: core::iter::traits::iterator::Iterator::fold at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/traits/iterator.rs:2481:21 12: as core::iter::traits::iterator::Iterator>::fold at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/adapters/map.rs:124:9 13: as core::iter::traits::iterator::Iterator>::fold at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/adapters/map.rs:124:9 14: ::sum at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/traits/accum.rs:50:17 15: core::iter::traits::iterator::Iterator::sum at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/iter/traits/iterator.rs:3476:9 16: n_times_faster_than_c::opt6_chunk_count at ./src/lib.rs:122:15 17: n_times_faster_than_c::tests::test_all_s at ./src/lib.rs:195:9 18: n_times_faster_than_c::tests::test_all_s::{{closure}} at ./src/lib.rs:192:21 19: core::ops::function::FnOnce::call_once at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs:250:5 20: core::ops::function::FnOnce::call_once at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs:250:5 note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace. failures: tests::test_all_s test result: FAILED. 2 passed; 1 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.29s res = (2 * res) - n_simd_elems as i64; error: test failed, to rerun pass `--lib` --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ceb5e6d..9b4ffce 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -121,7 +121,7 @@ simd_unrolled!(opt5_simd_unrolled_16x, 16); pub fn opt6_chunk_count(input: &str) -> i64 { let n_s = input .as_bytes() - .chunks(256) + .chunks(192) .map(|chunk| chunk.iter().map(|&b| b & 1).sum::()) .map(|chunk_total| chunk_total as i64) .sum::(); @@ -131,7 +131,7 @@ pub fn opt6_chunk_count(input: &str) -> i64 { /// Credit to u/Sharlinator /// https://www.reddit.com/r/rust/comments/14yvlc9/comment/jrwt29t pub fn opt6_chunk_exact_count(input: &str) -> i64 { - let iter = input.as_bytes().chunks_exact(256); + let iter = input.as_bytes().chunks_exact(192); let rest = iter.remainder(); let mut n_s = iter .map(|chunk| chunk.iter().map(|&b| b & 1).sum::()) From bf23c93d20fa7a5d36321e5a51221443502f0884 Mon Sep 17 00:00:00 2001 From: Peter Faiman Date: Sat, 15 Jul 2023 04:19:29 -0700 Subject: [PATCH 3/3] flush simd accumulator every 128 iterations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flushing every 255 iterations regressed performance by ~5%, but this eliminates the regression. run_switches/opt5_simd_unrolled_10x time: [24.636 µs 24.641 µs 24.647 µs] thrpt: [37.786 GiB/s 37.795 GiB/s 37.804 GiB/s] change: time: [+0.2771% +0.5216% +0.7572%] (p = 0.00 < 0.05) thrpt: [-0.7516% -0.5189% -0.2763%] Change within noise threshold. Found 8 outliers among 100 measurements (8.00%) 2 (2.00%) high mild 6 (6.00%) high severe --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9b4ffce..f618e4b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -60,7 +60,7 @@ pub fn opt4_simd(input: &str) -> i64 { let input_v = vld1q_u8(input[block_i * N_LANES..].as_ptr()); let eq_s_v = vandq_u8(input_v, one_v); acc_v = vaddq_u8(acc_v, eq_s_v); - if block_i % u8::MAX as usize == (u8::MAX - 1) as usize { + if block_i % 128 == 127 { res += vaddlvq_u8(acc_v) as i64; acc_v = vmovq_n_u8(0); } @@ -92,7 +92,7 @@ macro_rules! simd_unrolled { let v_eq_s~I= vandq_u8(v_input~I, one_v); v_acc~I = vaddq_u8(v_acc~I, v_eq_s~I); }); - if block_i % u8::MAX as usize == (u8::MAX - 1) as usize { + if block_i % 128 == 127 { seq!(I in 0..$unroll_factor { res += vaddlvq_u8(v_acc~I) as i64; v_acc~I = vmovq_n_u8(0);