From a6629b46bb4a486a16a9fa3660a79567ef7efea6 Mon Sep 17 00:00:00 2001 From: EbbDrop Date: Mon, 23 Dec 2024 01:06:53 +0100 Subject: [PATCH] Try to vectorize --- src/day22.rs | 200 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 161 insertions(+), 39 deletions(-) diff --git a/src/day22.rs b/src/day22.rs index 0739c00..6a3c224 100644 --- a/src/day22.rs +++ b/src/day22.rs @@ -1,5 +1,5 @@ use core::str; -use std::mem::transmute; +use std::{arch::x86_64::*, mem::transmute}; use aoc_runner_derive::aoc; @@ -46,70 +46,192 @@ pub fn part1(s: &str) -> u64 { const SEQUENCES: usize = 18 * 18 * 18 * 18; +#[inline(always)] +unsafe fn vmod10(a: __m256i) -> __m256i { + // Algo from LLVM + let prod02 = _mm256_mul_epu32(a, _mm256_set1_epi32(3435973837u32 as i32)); + let prod13 = _mm256_mul_epu32( + _mm256_shuffle_epi32::<0xf5>(a), + _mm256_set1_epi32(3435973837u32 as i32), + ); + let d = _mm256_unpackhi_epi64( + _mm256_unpacklo_epi32(prod02, prod13), + _mm256_unpackhi_epi32(prod02, prod13), + ); + + let d = _mm256_srli_epi32::<3>(d); + let c = _mm256_mullo_epi32(d, _mm256_set1_epi32(10)); + _mm256_sub_epi32(a, c) +} + +#[inline(always)] +unsafe fn vmod104976(a: __m256i) -> __m256i { + // Algo from LLVM + let prod02 = _mm256_mul_epu32(a, _mm256_set1_epi32(2681326939u32 as i32)); + let prod13 = _mm256_mul_epu32( + _mm256_shuffle_epi32::<0xf5>(a), + _mm256_set1_epi32(2681326939u32 as i32), + ); + let d = _mm256_unpackhi_epi64( + _mm256_unpacklo_epi32(prod02, prod13), + _mm256_unpackhi_epi32(prod02, prod13), + ); + + let d = _mm256_srli_epi32::<16>(d); + let c = _mm256_mullo_epi32(d, _mm256_set1_epi32(104976)); + _mm256_sub_epi32(a, c) +} + +static mut DONE: [[u16; 104976]; 8] = [[0u16; SEQUENCES]; 8]; + #[aoc(day22, part2)] -pub fn part2(s: &str) -> i16 { +pub fn part2(s: &str) -> i32 { let s = s.as_bytes(); let mut sequences = [0; SEQUENCES]; - let mut done = [0u16; SEQUENCES]; + let done = unsafe { &mut DONE }; + + for j in 0..8 { + done[j].fill(0); + } let mut i = 0; let mut monky = 1; unsafe { while i < s.len() { - #[cfg(not(test))] - let mut sn = (*s.get_unchecked(i + 0) as u32) * 100000 - + (*s.get_unchecked(i + 1) as u32) * 10000 - + (*s.get_unchecked(i + 2) as u32) * 1000 - + (*s.get_unchecked(i + 3) as u32) * 100 - + (*s.get_unchecked(i + 4) as u32) * 10 - + (*s.get_unchecked(i + 5) as u32) * 1 - - (b'0' as u32 * 111_111); - #[cfg(not(test))] - { - i += 6; - } + let mut sns: __m256i = _mm256_setzero_si256(); + let mut sns_len = 0; + while i < s.len() && sns_len < 8 { + #[cfg(not(test))] + let mut sn = (*s.get_unchecked(i + 0) as u32) * 100000 + + (*s.get_unchecked(i + 1) as u32) * 10000 + + (*s.get_unchecked(i + 2) as u32) * 1000 + + (*s.get_unchecked(i + 3) as u32) * 100 + + (*s.get_unchecked(i + 4) as u32) * 10 + + (*s.get_unchecked(i + 5) as u32) * 1 + - (b'0' as u32 * 111_111); + #[cfg(not(test))] + { + i += 6; + } - #[cfg(test)] - let mut sn = 0; - while *s.get_unchecked(i) != b'\n' { - sn *= 10; - sn += (s.get_unchecked(i) - b'0') as u32; + #[cfg(test)] + let mut sn = 0; + while *s.get_unchecked(i) != b'\n' { + sn *= 10; + sn += (s.get_unchecked(i) - b'0') as u32; + i += 1; + } i += 1; + sns = _mm256_permutevar8x32_epi32(sns, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6)); + sns = _mm256_blend_epi32::<1>(sns, _mm256_set1_epi32(sn as i32)); + sns_len += 1; } - i += 1; - let mut diffs = 0; - let mut prev = sn % 10; + let mut diffs = _mm256_setzero_si256(); + let mut prev = vmod10(sns); for _ in 0..3 { - sn = ((sn as u64 * 64) % MAX as u64) as u32 ^ sn; - sn = (sn / 32) ^ sn; - sn = ((sn as u64 * 2048) % MAX as u64) as u32 ^ sn; - let price = sn % 10; - let diff = price + 9 - prev; - diffs = diffs * 18 + diff; + let i = _mm256_slli_epi32::<6>(sns); + let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152)); + sns = _mm256_xor_si256(i, sns); + + let i = _mm256_srli_epi32::<5>(sns); + sns = _mm256_xor_si256(i, sns); + let i = _mm256_slli_epi32::<11>(sns); + let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152)); + sns = _mm256_xor_si256(i, sns); + + let price = vmod10(sns); + let diff = _mm256_sub_epi32(_mm256_add_epi32(price, _mm256_set1_epi32(9)), prev); + diffs = _mm256_add_epi32(_mm256_mullo_epi32(diffs, _mm256_set1_epi32(18)), diff); prev = price; } for _ in 4..2000 { - sn = ((sn as u64 * 64) % MAX as u64) as u32 ^ sn; - sn = (sn / 32) ^ sn; - sn = ((sn as u64 * 2048) % MAX as u64) as u32 ^ sn; - let price = sn % 10; - let diff = price + 9 - prev; - diffs = (diffs * 18 + diff) % SEQUENCES as u32; + let i = _mm256_slli_epi32::<6>(sns); + let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152)); + sns = _mm256_xor_si256(i, sns); + let i = _mm256_srli_epi32::<5>(sns); + sns = _mm256_xor_si256(i, sns); + let i = _mm256_slli_epi32::<11>(sns); + let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152)); + sns = _mm256_xor_si256(i, sns); + + let price = vmod10(sns); + let diff = _mm256_sub_epi32(_mm256_add_epi32(price, _mm256_set1_epi32(9)), prev); + diffs = _mm256_add_epi32(_mm256_mullo_epi32(diffs, _mm256_set1_epi32(18)), diff); + diffs = vmod104976(diffs); + + let diff_i = _mm256_extract_epi32::<0>(diffs) as usize; + std::hint::assert_unchecked(diff_i < SEQUENCES); + if done[0][diff_i] != monky + 0 { + let price = _mm256_extract_epi32::<0>(price); + sequences[diff_i] += price; + + done[0][diff_i] = monky + 0; + } + let diff_i = _mm256_extract_epi32::<1>(diffs) as usize; + std::hint::assert_unchecked(diff_i < SEQUENCES); + if done[1][diff_i] != monky + 1 { + let price = _mm256_extract_epi32::<1>(price); + sequences[diff_i] += price; - if done[diffs as usize] != monky { - sequences[diffs as usize] += price as i16; + done[1][diff_i] = monky + 1; + } + let diff_i = _mm256_extract_epi32::<2>(diffs) as usize; + std::hint::assert_unchecked(diff_i < SEQUENCES); + if done[2][diff_i] != monky + 2 { + let price = _mm256_extract_epi32::<2>(price); + sequences[diff_i] += price; + + done[2][diff_i] = monky + 2; + } + let diff_i = _mm256_extract_epi32::<3>(diffs) as usize; + std::hint::assert_unchecked(diff_i < SEQUENCES); + if done[3][diff_i] != monky + 3 { + let price = _mm256_extract_epi32::<3>(price); + sequences[diff_i] += price; + + done[3][diff_i] = monky + 3; + } + let diff_i = _mm256_extract_epi32::<4>(diffs) as usize; + std::hint::assert_unchecked(diff_i < SEQUENCES); + if done[4][diff_i] != monky + 4 { + let price = _mm256_extract_epi32::<4>(price); + sequences[diff_i] += price; + + done[4][diff_i] = monky + 4; + } + let diff_i = _mm256_extract_epi32::<5>(diffs) as usize; + std::hint::assert_unchecked(diff_i < SEQUENCES); + if done[5][diff_i] != monky + 5 { + let price = _mm256_extract_epi32::<5>(price); + sequences[diff_i] += price; + + done[5][diff_i] = monky + 5; + } + let diff_i = _mm256_extract_epi32::<6>(diffs) as usize; + std::hint::assert_unchecked(diff_i < SEQUENCES); + if done[6][diff_i] != monky + 6 { + let price = _mm256_extract_epi32::<6>(price); + sequences[diff_i] += price; + + done[6][diff_i] = monky + 6; + } + let diff_i = _mm256_extract_epi32::<7>(diffs) as usize; + std::hint::assert_unchecked(diff_i < SEQUENCES); + if done[7][diff_i] != monky + 7 { + let price = _mm256_extract_epi32::<7>(price); + sequences[diff_i] += price; - done[diffs as usize] = monky; + done[7][diff_i] = monky + 7; } prev = price; } - monky += 1; + monky += 8; } sequences.into_iter().max().unwrap_unchecked()