From 46aceedc47c426d6b67b35747cf2e4c6655c25fa Mon Sep 17 00:00:00 2001
From: Tony Sifkarovski
Date: Fri, 17 Nov 2017 03:22:18 -0500
Subject: [PATCH] [avx2] add _mm256_cvtepu{8,16,32}_epi{16,32,64} (#192)

---
 src/lib.rs      |   3 ++
 src/x86/avx2.rs | 112 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 102 insertions(+), 13 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index e8dfc558d64ad..6e41a7026f46b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -173,6 +173,9 @@ mod v32 {
     define_ty! { i8x4, i8, i8, i8, i8 }
     define_impl! { i8x4, i8, 4, i8x4, x0, x1, x2, x3 }
 
+    define_ty! { u8x4, u8, u8, u8, u8 }
+    define_impl! { u8x4, u8, 4, i8x4, x0, x1, x2, x3 }
+
     define_casts!((i8x4, i32x4, as_i32x4), (i16x2, i64x2, as_i64x2));
 }
 
diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs
index acad7d274791c..66475fcc77092 100644
--- a/src/x86/avx2.rs
+++ b/src/x86/avx2.rs
@@ -583,26 +583,66 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: i8x16) -> i32x8 {
     simd_cast::<::v64::i8x8, _>(simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]))
 }
 
-/// An i8x4 type is pretty useless, but we need it as an intermediate type in
-/// _mm256_cvtepi8_epi64.
-#[repr(simd)]
-#[allow(non_camel_case_types)]
-struct i8x4(i8, i8, i8, i8);
-
 /// Sign-extend 8-bit integers to 64-bit integers.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmovsxbq))]
 pub unsafe fn _mm256_cvtepi8_epi64(a: i8x16) -> i64x4 {
-    simd_cast::<i8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
+    simd_cast::<::v32::i8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
+}
+
+/// Zero-extend unsigned 16-bit integers in `a` to 32-bit integers. All
+/// eight elements of `a` are converted.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub unsafe fn _mm256_cvtepu16_epi32(a: u16x8) -> i32x8 {
+    simd_cast(a)
+}
+
+/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
+/// integers. The upper four elements of `a` are unused.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub unsafe fn _mm256_cvtepu16_epi64(a: u16x8) -> i64x4 {
+    simd_cast::<::v64::u16x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
+}
+
+/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub unsafe fn _mm256_cvtepu32_epi64(a: u32x4) -> i64x4 {
+    simd_cast(a)
+}
+
+/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub unsafe fn _mm256_cvtepu8_epi16(a: u8x16) -> i16x16 {
+    simd_cast(a)
+}
+
+/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
+/// integers. The upper eight elements of `a` are unused.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub unsafe fn _mm256_cvtepu8_epi32(a: u8x16) -> i32x8 {
+    simd_cast::<::v64::u8x8, _>(simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]))
+}
+
+/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
+/// integers. The upper twelve elements of `a` are unused.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub unsafe fn _mm256_cvtepu8_epi64(a: u8x16) -> i64x4 {
+    simd_cast::<::v32::u8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
 }
 
-// TODO _mm256_cvtepu16_epi32
-// TODO _mm256_cvtepu16_epi64
-// TODO _mm256_cvtepu32_epi64
-// TODO _mm256_cvtepu8_epi16
-// TODO _mm256_cvtepu8_epi32
-// TODO _mm256_cvtepu8_epi64
 // TODO _m128i _mm256_extracti128_si256
 
 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`.
@@ -2738,6 +2778,52 @@ mod tests {
         assert_eq!(r, avx2::_mm256_cvtepi32_epi64(a));
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu16_epi32() {
+        let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        assert_eq!(r, avx2::_mm256_cvtepu16_epi32(a));
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu16_epi64() {
+        let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = i64x4::new(0, 1, 2, 3);
+        assert_eq!(r, avx2::_mm256_cvtepu16_epi64(a));
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu32_epi64() {
+        let a = u32x4::new(0, 1, 2, 3);
+        let r = i64x4::new(0, 1, 2, 3);
+        assert_eq!(r, avx2::_mm256_cvtepu32_epi64(a));
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu8_epi16() {
+        let a =
+            u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r =
+            i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        assert_eq!(r, avx2::_mm256_cvtepu8_epi16(a));
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu8_epi32() {
+        let a =
+            u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        assert_eq!(r, avx2::_mm256_cvtepu8_epi32(a));
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu8_epi64() {
+        let a =
+            u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = i64x4::new(0, 1, 2, 3);
+        assert_eq!(r, avx2::_mm256_cvtepu8_epi64(a));
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm256_hadd_epi16() {
         let a = i16x16::splat(2);
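
Usage note (not part of the patch): a minimal sketch of how the new zero-extension intrinsics might be called, mirroring the added tests. It assumes the crate's portable vector types (u8x16, i32x8, i64x4) and the x86::avx2 module shown above are in scope, and that the caller has already verified AVX2 support; the function name is illustrative only.

// Sketch only: widen unsigned byte lanes the same way the added tests do.
// `u8x16`, `i32x8`, `i64x4`, and `avx2` are assumed to be imported from this crate.
unsafe fn widen_bytes_example() {
    let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    // The low eight u8 lanes become eight i32 lanes; the high eight lanes are ignored.
    let as_i32: i32x8 = avx2::_mm256_cvtepu8_epi32(a);
    assert_eq!(as_i32, i32x8::new(0, 1, 2, 3, 4, 5, 6, 7));

    // The low four u8 lanes become four i64 lanes; the high twelve lanes are ignored.
    let as_i64: i64x4 = avx2::_mm256_cvtepu8_epi64(a);
    assert_eq!(as_i64, i64x4::new(0, 1, 2, 3));
}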