diff --git a/coresimd/src/x86/i586/sse2.rs b/coresimd/src/x86/i586/sse2.rs index 22da044b32205..64f4d568c4af2 100644 --- a/coresimd/src/x86/i586/sse2.rs +++ b/coresimd/src/x86/i586/sse2.rs @@ -2333,12 +2333,6 @@ mod tests { use x86::*; use v128::*; - // not actually an intrinsics in SSE2 but useful in the tests below - #[target_feature = "+sse2"] - unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i { - _mm_set_epi64x(b, a) - } - #[simd_test = "sse2"] unsafe fn test_mm_pause() { _mm_pause(); diff --git a/coresimd/src/x86/i586/sse41.rs b/coresimd/src/x86/i586/sse41.rs index 60f972f4d08ab..b5c50fbb65375 100644 --- a/coresimd/src/x86/i586/sse41.rs +++ b/coresimd/src/x86/i586/sse41.rs @@ -4,9 +4,10 @@ use core::mem; #[cfg(test)] use stdsimd_test::assert_instr; -use simd_llvm::{simd_shuffle2, simd_shuffle4, simd_shuffle8}; +use simd_llvm::*; use v128::*; +use x86::*; // SSE4 rounding constans /// round to nearest @@ -49,8 +50,8 @@ pub const _MM_FROUND_NEARBYINT: i32 = #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pblendvb))] -pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 { - pblendvb(a, b, mask) +pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { + mem::transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16())) } /// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`. @@ -61,11 +62,13 @@ pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))] -pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: i32) -> i16x8 { +pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + let a = a.as_i16x8(); + let b = b.as_i16x8(); macro_rules! 
call { ($imm8:expr) => { pblendw(a, b, $imm8) } } - constify_imm8!(imm8, call) + mem::transmute(constify_imm8!(imm8, call)) } /// Blend packed double-precision (64-bit) floating-point elements from `a` @@ -73,7 +76,7 @@ pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: i32) -> i16x8 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(blendvpd))] -pub unsafe fn _mm_blendv_pd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2 { +pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { blendvpd(a, b, mask) } @@ -82,7 +85,7 @@ pub unsafe fn _mm_blendv_pd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(blendvps))] -pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 { +pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { blendvps(a, b, mask) } @@ -91,7 +94,7 @@ pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))] -pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: i32) -> f64x2 { +pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d { macro_rules! call { ($imm2:expr) => { blendpd(a, b, $imm2) } } @@ -103,7 +106,7 @@ pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: i32) -> f64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))] -pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: i32) -> f32x4 { +pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 { macro_rules! 
call { ($imm4:expr) => { blendps(a, b, $imm4) } } @@ -116,8 +119,8 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: i32) -> f32x4 { #[target_feature = "+sse4.1"] // TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8 = 0))] -pub unsafe fn _mm_extract_ps(a: f32x4, imm8: i32) -> i32 { - mem::transmute(a.extract(imm8 as u32 & 0b11)) +pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 { + mem::transmute(simd_extract::<_, f32>(a, imm8 as u32 & 0b11)) } /// Extract an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit @@ -127,9 +130,9 @@ pub unsafe fn _mm_extract_ps(a: f32x4, imm8: i32) -> i32 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pextrb, imm8 = 0))] -pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: i32) -> i32 { +pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 { let imm8 = (imm8 & 15) as u32; - (a.extract_unchecked(imm8) as i32) & 0xFF + simd_extract::<_, u8>(a.as_u8x16(), imm8) as i32 } /// Extract an 32-bit integer from `a` selected with `imm8` @@ -137,9 +140,9 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: i32) -> i32 { #[target_feature = "+sse4.1"] // TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8 = 1))] -pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: i32) -> i32 { +pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 { let imm8 = (imm8 & 3) as u32; - a.extract_unchecked(imm8) as i32 + simd_extract::<_, i32>(a.as_i32x4(), imm8) } /// Select a single value in `a` to store at some position in `b`, @@ -167,7 +170,7 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: i32) -> i32 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))] -pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 { +pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { macro_rules! 
call { ($imm8:expr) => { insertps(a, b, $imm8) } } @@ -179,8 +182,8 @@ pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))] -pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: i32) -> i8x16 { - a.replace((imm8 & 0b1111) as u32, i) +pub unsafe fn _mm_insert_epi8(a: __m128i, i: i8, imm8: i32) -> __m128i { + mem::transmute(simd_insert(a.as_i8x16(), (imm8 & 0b1111) as u32, i)) } /// Return a copy of `a` with the 32-bit integer from `i` inserted at a @@ -188,8 +191,8 @@ pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: i32) -> i8x16 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))] -pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: i32) -> i32x4 { - a.replace((imm8 & 0b11) as u32, i) +pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i { + mem::transmute(simd_insert(a.as_i32x4(), (imm8 & 0b11) as u32, i)) } /// Compare packed 8-bit integers in `a` and `b` and return packed maximum @@ -197,8 +200,8 @@ pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: i32) -> i32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmaxsb))] -pub unsafe fn _mm_max_epi8(a: i8x16, b: i8x16) -> i8x16 { - pmaxsb(a, b) +pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(pmaxsb(a.as_i8x16(), b.as_i8x16())) } /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed @@ -206,8 +209,8 @@ pub unsafe fn _mm_max_epi8(a: i8x16, b: i8x16) -> i8x16 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmaxuw))] -pub unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 { - pmaxuw(a, b) +pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(pmaxuw(a.as_u16x8(), b.as_u16x8())) } /// Compare packed 32-bit integers in `a` and `b`, and return packed maximum @@ -215,8 +218,8 @@ pub 
unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmaxsd))] -pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 { - pmaxsd(a, b) +pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(pmaxsd(a.as_i32x4(), b.as_i32x4())) } /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed @@ -224,8 +227,8 @@ pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmaxud))] -pub unsafe fn _mm_max_epu32(a: u32x4, b: u32x4) -> u32x4 { - pmaxud(a, b) +pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(pmaxud(a.as_u32x4(), b.as_u32x4())) } /// Compare packed 8-bit integers in `a` and `b` and return packed minimum @@ -233,8 +236,8 @@ pub unsafe fn _mm_max_epu32(a: u32x4, b: u32x4) -> u32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pminsb))] -pub unsafe fn _mm_min_epi8(a: i8x16, b: i8x16) -> i8x16 { - pminsb(a, b) +pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(pminsb(a.as_i8x16(), b.as_i8x16())) } /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed @@ -242,8 +245,8 @@ pub unsafe fn _mm_min_epi8(a: i8x16, b: i8x16) -> i8x16 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pminuw))] -pub unsafe fn _mm_min_epu16(a: u16x8, b: u16x8) -> u16x8 { - pminuw(a, b) +pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(pminuw(a.as_u16x8(), b.as_u16x8())) } /// Compare packed 32-bit integers in `a` and `b`, and return packed minimum @@ -251,8 +254,8 @@ pub unsafe fn _mm_min_epu16(a: u16x8, b: u16x8) -> u16x8 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pminsd))] -pub unsafe fn _mm_min_epi32(a: i32x4, b: i32x4) -> i32x4 { - pminsd(a, b) +pub unsafe fn 
_mm_min_epi32(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(pminsd(a.as_i32x4(), b.as_i32x4())) } /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed @@ -260,8 +263,8 @@ pub unsafe fn _mm_min_epi32(a: i32x4, b: i32x4) -> i32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pminud))] -pub unsafe fn _mm_min_epu32(a: u32x4, b: u32x4) -> u32x4 { - pminud(a, b) +pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(pminud(a.as_u32x4(), b.as_u32x4())) } /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -269,32 +272,36 @@ pub unsafe fn _mm_min_epu32(a: u32x4, b: u32x4) -> u32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(packusdw))] -pub unsafe fn _mm_packus_epi32(a: i32x4, b: i32x4) -> u16x8 { - packusdw(a, b) +pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(packusdw(a.as_i32x4(), b.as_i32x4())) } /// Compare packed 64-bit integers in `a` and `b` for equality #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pcmpeqq))] -pub unsafe fn _mm_cmpeq_epi64(a: i64x2, b: i64x2) -> i64x2 { - a.eq(b) +pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) } /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovsxbw))] -pub unsafe fn _mm_cvtepi8_epi16(a: i8x16) -> i16x8 { - simd_shuffle8::<_, ::v64::i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]).as_i16x8() +pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { + let a = a.as_i8x16(); + let a = simd_shuffle8::<_, ::v64::i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + mem::transmute(simd_cast::<_, i16x8>(a)) } /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, 
assert_instr(pmovsxbd))] -pub unsafe fn _mm_cvtepi8_epi32(a: i8x16) -> i32x4 { - simd_shuffle4::<_, ::v32::i8x4>(a, a, [0, 1, 2, 3]).as_i32x4() +pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { + let a = a.as_i8x16(); + let a = simd_shuffle4::<_, ::v32::i8x4>(a, a, [0, 1, 2, 3]); + mem::transmute(simd_cast::<_, i32x4>(a)) } /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed @@ -302,56 +309,70 @@ pub unsafe fn _mm_cvtepi8_epi32(a: i8x16) -> i32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovsxbq))] -pub unsafe fn _mm_cvtepi8_epi64(a: i8x16) -> i64x2 { - simd_shuffle2::<_, ::v16::i8x2>(a, a, [0, 1]).as_i64x2() +pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { + let a = a.as_i8x16(); + let a = simd_shuffle2::<_, ::v16::i8x2>(a, a, [0, 1]); + mem::transmute(simd_cast::<_, i64x2>(a)) } /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovsxwd))] -pub unsafe fn _mm_cvtepi16_epi32(a: i16x8) -> i32x4 { - simd_shuffle4::<_, ::v64::i16x4>(a, a, [0, 1, 2, 3]).as_i32x4() +pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { + let a = a.as_i16x8(); + let a = simd_shuffle4::<_, ::v64::i16x4>(a, a, [0, 1, 2, 3]); + mem::transmute(simd_cast::<_, i32x4>(a)) } /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovsxwq))] -pub unsafe fn _mm_cvtepi16_epi64(a: i16x8) -> i64x2 { - simd_shuffle2::<_, ::v32::i16x2>(a, a, [0, 1]).as_i64x2() +pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { + let a = a.as_i16x8(); + let a = simd_shuffle2::<_, ::v32::i16x2>(a, a, [0, 1]); + mem::transmute(simd_cast::<_, i64x2>(a)) } /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovsxdq))] -pub unsafe fn 
_mm_cvtepi32_epi64(a: i32x4) -> i64x2 { - simd_shuffle2::<_, ::v64::i32x2>(a, a, [0, 1]).as_i64x2() +pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { + let a = a.as_i32x4(); + let a = simd_shuffle2::<_, ::v64::i32x2>(a, a, [0, 1]); + mem::transmute(simd_cast::<_, i64x2>(a)) } /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovzxbw))] -pub unsafe fn _mm_cvtepu8_epi16(a: u8x16) -> i16x8 { - simd_shuffle8::<_, ::v64::u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]).as_i16x8() +pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { + let a = a.as_u8x16(); + let a = simd_shuffle8::<_, ::v64::u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + mem::transmute(simd_cast::<_, i16x8>(a)) } /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovzxbd))] -pub unsafe fn _mm_cvtepu8_epi32(a: u8x16) -> i32x4 { - simd_shuffle4::<_, ::v32::u8x4>(a, a, [0, 1, 2, 3]).as_i32x4() +pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { + let a = a.as_u8x16(); + let a = simd_shuffle4::<_, ::v32::u8x4>(a, a, [0, 1, 2, 3]); + mem::transmute(simd_cast::<_, i32x4>(a)) } /// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovzxbq))] -pub unsafe fn _mm_cvtepu8_epi64(a: u8x16) -> i64x2 { - simd_shuffle2::<_, ::v16::u8x2>(a, a, [0, 1]).as_i64x2() +pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { + let a = a.as_u8x16(); + let a = simd_shuffle2::<_, ::v16::u8x2>(a, a, [0, 1]); + mem::transmute(simd_cast::<_, i64x2>(a)) } /// Zero extend packed unsigned 16-bit integers in `a` @@ -359,8 +380,10 @@ pub unsafe fn _mm_cvtepu8_epi64(a: u8x16) -> i64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovzxwd))] -pub unsafe fn _mm_cvtepu16_epi32(a: u16x8) 
-> i32x4 { - simd_shuffle4::<_, ::v64::u16x4>(a, a, [0, 1, 2, 3]).as_i32x4() +pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { + let a = a.as_u16x8(); + let a = simd_shuffle4::<_, ::v64::u16x4>(a, a, [0, 1, 2, 3]); + mem::transmute(simd_cast::<_, i32x4>(a)) } /// Zero extend packed unsigned 16-bit integers in `a` @@ -368,8 +391,10 @@ pub unsafe fn _mm_cvtepu16_epi32(a: u16x8) -> i32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovzxwq))] -pub unsafe fn _mm_cvtepu16_epi64(a: u16x8) -> i64x2 { - simd_shuffle2::<_, ::v32::u16x2>(a, a, [0, 1]).as_i64x2() +pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { + let a = a.as_u16x8(); + let a = simd_shuffle2::<_, ::v32::u16x2>(a, a, [0, 1]); + mem::transmute(simd_cast::<_, i64x2>(a)) } /// Zero extend packed unsigned 32-bit integers in `a` @@ -377,11 +402,13 @@ pub unsafe fn _mm_cvtepu16_epi64(a: u16x8) -> i64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovzxdq))] -pub unsafe fn _mm_cvtepu32_epi64(a: u32x4) -> i64x2 { - simd_shuffle2::<_, ::v64::u32x2>(a, a, [0, 1]).as_i64x2() +pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { + let a = a.as_u32x4(); + let a = simd_shuffle2::<_, ::v64::u32x2>(a, a, [0, 1]); + mem::transmute(simd_cast::<_, i64x2>(a)) } -/// Returns the dot product of two f64x2 vectors. +/// Returns the dot product of two __m128d vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. /// If a condition mask bit is zero, the corresponding multiplication is @@ -391,14 +418,14 @@ pub unsafe fn _mm_cvtepu32_epi64(a: u32x4) -> i64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(dppd, imm8 = 0))] -pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 { +pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { macro_rules! 
call { ($imm8:expr) => { dppd(a, b, $imm8) } } constify_imm8!(imm8, call) } -/// Returns the dot product of two f32x4 vectors. +/// Returns the dot product of two __m128 vectors. /// /// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask. /// If a condition mask bit is zero, the corresponding multiplication is @@ -408,7 +435,7 @@ pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(dpps, imm8 = 0))] -pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 { +pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { macro_rules! call { ($imm8:expr) => { dpps(a, b, $imm8) } } @@ -421,7 +448,7 @@ pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundpd))] -pub unsafe fn _mm_floor_pd(a: f64x2) -> f64x2 { +pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d { roundpd(a, _MM_FROUND_FLOOR) } @@ -431,7 +458,7 @@ pub unsafe fn _mm_floor_pd(a: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundps))] -pub unsafe fn _mm_floor_ps(a: f32x4) -> f32x4 { +pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 { roundps(a, _MM_FROUND_FLOOR) } @@ -443,7 +470,7 @@ pub unsafe fn _mm_floor_ps(a: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundsd))] -pub unsafe fn _mm_floor_sd(a: f64x2, b: f64x2) -> f64x2 { +pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { roundsd(a, b, _MM_FROUND_FLOOR) } @@ -455,7 +482,7 @@ pub unsafe fn _mm_floor_sd(a: f64x2, b: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundss))] -pub unsafe fn _mm_floor_ss(a: f32x4, b: f32x4) -> f32x4 { +pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { roundss(a, b, _MM_FROUND_FLOOR) } @@ -465,7 +492,7 @@ pub unsafe fn _mm_floor_ss(a: 
f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundpd))] -pub unsafe fn _mm_ceil_pd(a: f64x2) -> f64x2 { +pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d { roundpd(a, _MM_FROUND_CEIL) } @@ -475,7 +502,7 @@ pub unsafe fn _mm_ceil_pd(a: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundps))] -pub unsafe fn _mm_ceil_ps(a: f32x4) -> f32x4 { +pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 { roundps(a, _MM_FROUND_CEIL) } @@ -487,7 +514,7 @@ pub unsafe fn _mm_ceil_ps(a: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundsd))] -pub unsafe fn _mm_ceil_sd(a: f64x2, b: f64x2) -> f64x2 { +pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { roundsd(a, b, _MM_FROUND_CEIL) } @@ -499,7 +526,7 @@ pub unsafe fn _mm_ceil_sd(a: f64x2, b: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundss))] -pub unsafe fn _mm_ceil_ss(a: f32x4, b: f32x4) -> f32x4 { +pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { roundss(a, b, _MM_FROUND_CEIL) } @@ -525,7 +552,7 @@ pub unsafe fn _mm_ceil_ss(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundpd, rounding = 0))] -pub unsafe fn _mm_round_pd(a: f64x2, rounding: i32) -> f64x2 { +pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { roundpd(a, $imm4) } } @@ -554,7 +581,7 @@ pub unsafe fn _mm_round_pd(a: f64x2, rounding: i32) -> f64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundps, rounding = 0))] -pub unsafe fn _mm_round_ps(a: f32x4, rounding: i32) -> f32x4 { +pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 { macro_rules! 
call { ($imm4:expr) => { roundps(a, $imm4) } } @@ -585,7 +612,7 @@ pub unsafe fn _mm_round_ps(a: f32x4, rounding: i32) -> f32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundsd, rounding = 0))] -pub unsafe fn _mm_round_sd(a: f64x2, b: f64x2, rounding: i32) -> f64x2 { +pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { roundsd(a, b, $imm4) } } @@ -616,14 +643,14 @@ pub unsafe fn _mm_round_sd(a: f64x2, b: f64x2, rounding: i32) -> f64x2 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(roundss, rounding = 0))] -pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 { +pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { roundss(a, b, $imm4) } } constify_imm4!(rounding, call) } -/// Finds the minimum unsigned 16-bit element in the 128-bit u16x8 vector, +/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector, /// returning a vector containing its value in its first position, and its /// index /// in its second position; all other elements are set to zero. @@ -633,7 +660,7 @@ pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 { /// /// Arguments: /// -/// * `a` - A 128-bit vector of type `u16x8`. +/// * `a` - A 128-bit vector of type `__m128i`. 
/// /// Returns: /// @@ -645,8 +672,8 @@ pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(phminposuw))] -pub unsafe fn _mm_minpos_epu16(a: u16x8) -> u16x8 { - phminposuw(a) +pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { + mem::transmute(phminposuw(a.as_u16x8())) } /// Multiply the low 32-bit integers from each packed 64-bit @@ -654,21 +681,21 @@ pub unsafe fn _mm_minpos_epu16(a: u16x8) -> u16x8 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmuldq))] -pub unsafe fn _mm_mul_epi32(a: i32x4, b: i32x4) -> i64x2 { - pmuldq(a, b) +pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(pmuldq(a.as_i32x4(), b.as_i32x4())) } /// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate /// 64-bit integers, and returns the lowest 32-bit, whatever they might be, -/// reinterpreted as a signed integer. While `pmulld i32x4::splat(2), -/// i32x4::splat(2)` returns the obvious `i32x4::splat(4)`, due to wrapping -/// arithmetic `pmulld i32x4::splat(i32::MAX), i32x4::splat(2)` would return a +/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2), +/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping +/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would return a /// negative number. #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmulld))] -pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 { - a * b +pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { + mem::transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) } /// Subtracts 8-bit unsigned integer values and computes the absolute @@ -691,8 +718,8 @@ pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 { /// /// Arguments: /// -/// * `a` - A 128-bit vector of type `i8x16`. -/// * `b` - A 128-bit vector of type `i8x16`. 
+/// * `a` - A 128-bit vector of type `__m128i`. +/// * `b` - A 128-bit vector of type `__m128i`. /// * `imm8` - An 8-bit immediate operand specifying how the absolute /// differences are to be calculated /// * Bit `[2]` specify the offset for operand `a` @@ -700,16 +727,18 @@ pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 { /// /// Returns: /// -/// * A `i16x8` vector containing the sums of the sets of +/// * A `__m128i` vector containing the sums of the sets of /// absolute differences between both operands. #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))] -pub unsafe fn _mm_mpsadbw_epu8(a: u8x16, b: u8x16, imm8: i32) -> u16x8 { +pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + let a = a.as_u8x16(); + let b = b.as_u8x16(); macro_rules! call { ($imm8:expr) => { mpsadbw(a, b, $imm8) } } - constify_imm3!(imm8, call) + mem::transmute(constify_imm3!(imm8, call)) } #[allow(improper_ctypes)] @@ -717,17 +746,17 @@ extern "C" { #[link_name = "llvm.x86.sse41.pblendvb"] fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16; #[link_name = "llvm.x86.sse41.blendvpd"] - fn blendvpd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2; + fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d; #[link_name = "llvm.x86.sse41.blendvps"] - fn blendvps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4; + fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128; #[link_name = "llvm.x86.sse41.blendpd"] - fn blendpd(a: f64x2, b: f64x2, imm2: u8) -> f64x2; + fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d; #[link_name = "llvm.x86.sse41.blendps"] - fn blendps(a: f32x4, b: f32x4, imm4: u8) -> f32x4; + fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128; #[link_name = "llvm.x86.sse41.pblendw"] fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8; #[link_name = "llvm.x86.sse41.insertps"] - fn insertps(a: f32x4, b: f32x4, imm8: u8) -> f32x4; + fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128; 
#[link_name = "llvm.x86.sse41.pmaxsb"] fn pmaxsb(a: i8x16, b: i8x16) -> i8x16; #[link_name = "llvm.x86.sse41.pmaxuw"] @@ -747,17 +776,17 @@ extern "C" { #[link_name = "llvm.x86.sse41.packusdw"] fn packusdw(a: i32x4, b: i32x4) -> u16x8; #[link_name = "llvm.x86.sse41.dppd"] - fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; + fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d; #[link_name = "llvm.x86.sse41.dpps"] - fn dpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4; + fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128; #[link_name = "llvm.x86.sse41.round.pd"] - fn roundpd(a: f64x2, rounding: i32) -> f64x2; + fn roundpd(a: __m128d, rounding: i32) -> __m128d; #[link_name = "llvm.x86.sse41.round.ps"] - fn roundps(a: f32x4, rounding: i32) -> f32x4; + fn roundps(a: __m128, rounding: i32) -> __m128; #[link_name = "llvm.x86.sse41.round.sd"] - fn roundsd(a: f64x2, b: f64x2, rounding: i32) -> f64x2; + fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d; #[link_name = "llvm.x86.sse41.round.ss"] - fn roundss(a: f32x4, b: f32x4, rounding: i32) -> f32x4; + fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128; #[link_name = "llvm.x86.sse41.phminposuw"] fn phminposuw(a: u16x8) -> u16x8; #[link_name = "llvm.x86.sse41.pmuldq"] @@ -770,145 +799,144 @@ extern "C" { mod tests { use std::mem; use stdsimd_test::simd_test; - use x86::i586::sse41; - use v128::*; + use x86::*; #[simd_test = "sse4.1"] - unsafe fn _mm_blendv_epi8() { + unsafe fn test_mm_blendv_epi8() { let a = - i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = i8x16::new( + let b = _mm_setr_epi8( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); let mask = - i8x16::new(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + _mm_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i8x16::new( + let e = 
_mm_setr_epi8( 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, ); - assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e); + assert_eq!(_mm_blendv_epi8(a, b, mask), e); } #[simd_test = "sse4.1"] - unsafe fn _mm_blendv_pd() { - let a = f64x2::splat(0.0); - let b = f64x2::splat(1.0); - let mask = mem::transmute(i64x2::new(0, -1)); - let r = sse41::_mm_blendv_pd(a, b, mask); - let e = f64x2::new(0.0, 1.0); - assert_eq!(r, e); + unsafe fn test_mm_blendv_pd() { + let a = _mm_set1_pd(0.0); + let b = _mm_set1_pd(1.0); + let mask = mem::transmute(_mm_setr_epi64x(0, -1)); + let r = _mm_blendv_pd(a, b, mask); + let e = _mm_setr_pd(0.0, 1.0); + assert_eq_m128d(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_blendv_ps() { - let a = f32x4::splat(0.0); - let b = f32x4::splat(1.0); - let mask = mem::transmute(i32x4::new(0, -1, 0, -1)); - let r = sse41::_mm_blendv_ps(a, b, mask); - let e = f32x4::new(0.0, 1.0, 0.0, 1.0); - assert_eq!(r, e); + unsafe fn test_mm_blendv_ps() { + let a = _mm_set1_ps(0.0); + let b = _mm_set1_ps(1.0); + let mask = mem::transmute(_mm_setr_epi32(0, -1, 0, -1)); + let r = _mm_blendv_ps(a, b, mask); + let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); + assert_eq_m128(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_blend_pd() { - let a = f64x2::splat(0.0); - let b = f64x2::splat(1.0); - let r = sse41::_mm_blend_pd(a, b, 0b10); - let e = f64x2::new(0.0, 1.0); - assert_eq!(r, e); + unsafe fn test_mm_blend_pd() { + let a = _mm_set1_pd(0.0); + let b = _mm_set1_pd(1.0); + let r = _mm_blend_pd(a, b, 0b10); + let e = _mm_setr_pd(0.0, 1.0); + assert_eq_m128d(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_blend_ps() { - let a = f32x4::splat(0.0); - let b = f32x4::splat(1.0); - let r = sse41::_mm_blend_ps(a, b, 0b1010); - let e = f32x4::new(0.0, 1.0, 0.0, 1.0); - assert_eq!(r, e); + unsafe fn test_mm_blend_ps() { + let a = _mm_set1_ps(0.0); + let b = _mm_set1_ps(1.0); + let r = _mm_blend_ps(a, b, 0b1010); + let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); + 
assert_eq_m128(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_blend_epi16() { - let a = i16x8::splat(0); - let b = i16x8::splat(1); - let r = sse41::_mm_blend_epi16(a, b, 0b1010_1100); - let e = i16x8::new(0, 0, 1, 1, 0, 1, 0, 1); + unsafe fn test_mm_blend_epi16() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let r = _mm_blend_epi16(a, b, 0b1010_1100); + let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_extract_ps() { - let a = f32x4::new(0.0, 1.0, 2.0, 3.0); - let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 1)); + unsafe fn test_mm_extract_ps() { + let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0); + let r: f32 = mem::transmute(_mm_extract_ps(a, 1)); assert_eq!(r, 1.0); - let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 5)); + let r: f32 = mem::transmute(_mm_extract_ps(a, 5)); assert_eq!(r, 1.0); } #[simd_test = "sse4.1"] - unsafe fn _mm_extract_epi8() { + unsafe fn test_mm_extract_epi8() { let a = - i8x16::new(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r1 = sse41::_mm_extract_epi8(a, 0); - let r2 = sse41::_mm_extract_epi8(a, 19); + _mm_setr_epi8(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r1 = _mm_extract_epi8(a, 0); + let r2 = _mm_extract_epi8(a, 19); assert_eq!(r1, 0xFF); assert_eq!(r2, 3); } #[simd_test = "sse4.1"] - unsafe fn _mm_extract_epi32() { - let a = i32x4::new(0, 1, 2, 3); - let r = sse41::_mm_extract_epi32(a, 1); + unsafe fn test_mm_extract_epi32() { + let a = _mm_setr_epi32(0, 1, 2, 3); + let r = _mm_extract_epi32(a, 1); assert_eq!(r, 1); - let r = sse41::_mm_extract_epi32(a, 5); + let r = _mm_extract_epi32(a, 5); assert_eq!(r, 1); } #[simd_test = "sse4.1"] - unsafe fn _mm_insert_ps() { - let a = f32x4::splat(1.0); - let b = f32x4::new(1.0, 2.0, 3.0, 4.0); - let r = sse41::_mm_insert_ps(a, b, 0b11_00_1100); - let e = f32x4::new(4.0, 1.0, 0.0, 0.0); - assert_eq!(r, e); + unsafe fn test_mm_insert_ps() { + let a = _mm_set1_ps(1.0); + let b = 
_mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_insert_ps(a, b, 0b11_00_1100); + let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0); + assert_eq_m128(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_insert_epi8() { - let a = i8x16::splat(0); - let e = i8x16::splat(0).replace(1, 32); - let r = sse41::_mm_insert_epi8(a, 32, 1); + unsafe fn test_mm_insert_epi8() { + let a = _mm_set1_epi8(0); + let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm_insert_epi8(a, 32, 1); assert_eq!(r, e); - let r = sse41::_mm_insert_epi8(a, 32, 17); + let r = _mm_insert_epi8(a, 32, 17); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_insert_epi32() { - let a = i32x4::splat(0); - let e = i32x4::splat(0).replace(1, 32); - let r = sse41::_mm_insert_epi32(a, 32, 1); + unsafe fn test_mm_insert_epi32() { + let a = _mm_set1_epi32(0); + let e = _mm_setr_epi32(0, 32, 0, 0); + let r = _mm_insert_epi32(a, 32, 1); assert_eq!(r, e); - let r = sse41::_mm_insert_epi32(a, 32, 5); + let r = _mm_insert_epi32(a, 32, 5); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_max_epi8() { + unsafe fn test_mm_max_epi8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i8x16::new( + let a = _mm_setr_epi8( 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = i8x16::new( + let b = _mm_setr_epi8( 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31, ); - let r = sse41::_mm_max_epi8(a, b); + let r = _mm_max_epi8(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i8x16::new( + let e = _mm_setr_epi8( 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, ); @@ -916,47 +944,47 @@ mod tests { } #[simd_test = "sse4.1"] - unsafe fn _mm_max_epu16() { - let a = u16x8::new(1, 4, 5, 8, 9, 12, 13, 16); - let b = u16x8::new(2, 3, 6, 7, 10, 11, 14, 15); - let r = sse41::_mm_max_epu16(a, b); - let e = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16); + unsafe fn test_mm_max_epu16() { + let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); + 
let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm_max_epu16(a, b); + let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_max_epi32() { - let a = i32x4::new(1, 4, 5, 8); - let b = i32x4::new(2, 3, 6, 7); - let r = sse41::_mm_max_epi32(a, b); - let e = i32x4::new(2, 4, 6, 8); + unsafe fn test_mm_max_epi32() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = _mm_setr_epi32(2, 3, 6, 7); + let r = _mm_max_epi32(a, b); + let e = _mm_setr_epi32(2, 4, 6, 8); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_max_epu32() { - let a = u32x4::new(1, 4, 5, 8); - let b = u32x4::new(2, 3, 6, 7); - let r = sse41::_mm_max_epu32(a, b); - let e = u32x4::new(2, 4, 6, 8); + unsafe fn test_mm_max_epu32() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = _mm_setr_epi32(2, 3, 6, 7); + let r = _mm_max_epu32(a, b); + let e = _mm_setr_epi32(2, 4, 6, 8); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_min_epi8_1() { + unsafe fn test_mm_min_epi8_1() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i8x16::new( + let a = _mm_setr_epi8( 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = i8x16::new( + let b = _mm_setr_epi8( 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31, ); - let r = sse41::_mm_min_epi8(a, b); + let r = _mm_min_epi8(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i8x16::new( + let e = _mm_setr_epi8( 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, ); @@ -964,20 +992,20 @@ mod tests { } #[simd_test = "sse4.1"] - unsafe fn _mm_min_epi8_2() { + unsafe fn test_mm_min_epi8_2() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i8x16::new( + let a = _mm_setr_epi8( 1, -4, -5, 8, -9, -12, 13, -16, 17, 20, 21, 24, 25, 28, 29, 32, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = i8x16::new( + let b = _mm_setr_epi8( 2, -3, -6, 7, -10, -11, 14, -15, 18, 19, 22, 23, 26, 27, 30, 31, ); - let r = sse41::_mm_min_epi8(a, b); + 
let r = _mm_min_epi8(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i8x16::new( + let e = _mm_setr_epi8( 1, -4, -6, 7, -10, -12, 13, -16, 17, 19, 21, 23, 25, 27, 29, 31, ); @@ -985,401 +1013,399 @@ mod tests { } #[simd_test = "sse4.1"] - unsafe fn _mm_min_epu16() { - let a = u16x8::new(1, 4, 5, 8, 9, 12, 13, 16); - let b = u16x8::new(2, 3, 6, 7, 10, 11, 14, 15); - let r = sse41::_mm_min_epu16(a, b); - let e = u16x8::new(1, 3, 5, 7, 9, 11, 13, 15); + unsafe fn test_mm_min_epu16() { + let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm_min_epu16(a, b); + let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_min_epi32_1() { - let a = i32x4::new(1, 4, 5, 8); - let b = i32x4::new(2, 3, 6, 7); - let r = sse41::_mm_min_epi32(a, b); - let e = i32x4::new(1, 3, 5, 7); + unsafe fn test_mm_min_epi32_1() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = _mm_setr_epi32(2, 3, 6, 7); + let r = _mm_min_epi32(a, b); + let e = _mm_setr_epi32(1, 3, 5, 7); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_min_epi32_2() { - let a = i32x4::new(-1, 4, 5, -7); - let b = i32x4::new(-2, 3, -6, 8); - let r = sse41::_mm_min_epi32(a, b); - let e = i32x4::new(-2, 3, -6, -7); + unsafe fn test_mm_min_epi32_2() { + let a = _mm_setr_epi32(-1, 4, 5, -7); + let b = _mm_setr_epi32(-2, 3, -6, 8); + let r = _mm_min_epi32(a, b); + let e = _mm_setr_epi32(-2, 3, -6, -7); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_min_epu32() { - let a = u32x4::new(1, 4, 5, 8); - let b = u32x4::new(2, 3, 6, 7); - let r = sse41::_mm_min_epu32(a, b); - let e = u32x4::new(1, 3, 5, 7); + unsafe fn test_mm_min_epu32() { + let a = _mm_setr_epi32(1, 4, 5, 8); + let b = _mm_setr_epi32(2, 3, 6, 7); + let r = _mm_min_epu32(a, b); + let e = _mm_setr_epi32(1, 3, 5, 7); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_packus_epi32() { - let a = i32x4::new(1, 2, 3, 4); - let 
b = i32x4::new(-1, -2, -3, -4); - let r = sse41::_mm_packus_epi32(a, b); - let e = u16x8::new(1, 2, 3, 4, 0, 0, 0, 0); + unsafe fn test_mm_packus_epi32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let b = _mm_setr_epi32(-1, -2, -3, -4); + let r = _mm_packus_epi32(a, b); + let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cmpeq_epi64() { - let a = i64x2::new(0, 1); - let b = i64x2::new(0, 0); - let r = sse41::_mm_cmpeq_epi64(a, b); - let e = i64x2::new(-1, 0); + unsafe fn test_mm_cmpeq_epi64() { + let a = _mm_setr_epi64x(0, 1); + let b = _mm_setr_epi64x(0, 0); + let r = _mm_cmpeq_epi64(a, b); + let e = _mm_setr_epi64x(-1, 0); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepi8_epi16() { - let a = i8x16::splat(10); - let r = sse41::_mm_cvtepi8_epi16(a); - let e = i16x8::splat(10); + unsafe fn test_mm_cvtepi8_epi16() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepi8_epi16(a); + let e = _mm_set1_epi16(10); assert_eq!(r, e); - let a = i8x16::splat(-10); - let r = sse41::_mm_cvtepi8_epi16(a); - let e = i16x8::splat(-10); + let a = _mm_set1_epi8(-10); + let r = _mm_cvtepi8_epi16(a); + let e = _mm_set1_epi16(-10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepi8_epi32() { - let a = i8x16::splat(10); - let r = sse41::_mm_cvtepi8_epi32(a); - let e = i32x4::splat(10); + unsafe fn test_mm_cvtepi8_epi32() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepi8_epi32(a); + let e = _mm_set1_epi32(10); assert_eq!(r, e); - let a = i8x16::splat(-10); - let r = sse41::_mm_cvtepi8_epi32(a); - let e = i32x4::splat(-10); + let a = _mm_set1_epi8(-10); + let r = _mm_cvtepi8_epi32(a); + let e = _mm_set1_epi32(-10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepi8_epi64() { - let a = i8x16::splat(10); - let r = sse41::_mm_cvtepi8_epi64(a); - let e = i64x2::splat(10); + unsafe fn test_mm_cvtepi8_epi64() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepi8_epi64(a); + let e = 
_mm_set1_epi64x(10); assert_eq!(r, e); - let a = i8x16::splat(-10); - let r = sse41::_mm_cvtepi8_epi64(a); - let e = i64x2::splat(-10); + let a = _mm_set1_epi8(-10); + let r = _mm_cvtepi8_epi64(a); + let e = _mm_set1_epi64x(-10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepi16_epi32() { - let a = i16x8::splat(10); - let r = sse41::_mm_cvtepi16_epi32(a); - let e = i32x4::splat(10); + unsafe fn test_mm_cvtepi16_epi32() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepi16_epi32(a); + let e = _mm_set1_epi32(10); assert_eq!(r, e); - let a = i16x8::splat(-10); - let r = sse41::_mm_cvtepi16_epi32(a); - let e = i32x4::splat(-10); + let a = _mm_set1_epi16(-10); + let r = _mm_cvtepi16_epi32(a); + let e = _mm_set1_epi32(-10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepi16_epi64() { - let a = i16x8::splat(10); - let r = sse41::_mm_cvtepi16_epi64(a); - let e = i64x2::splat(10); + unsafe fn test_mm_cvtepi16_epi64() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepi16_epi64(a); + let e = _mm_set1_epi64x(10); assert_eq!(r, e); - let a = i16x8::splat(-10); - let r = sse41::_mm_cvtepi16_epi64(a); - let e = i64x2::splat(-10); + let a = _mm_set1_epi16(-10); + let r = _mm_cvtepi16_epi64(a); + let e = _mm_set1_epi64x(-10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepi32_epi64() { - let a = i32x4::splat(10); - let r = sse41::_mm_cvtepi32_epi64(a); - let e = i64x2::splat(10); + unsafe fn test_mm_cvtepi32_epi64() { + let a = _mm_set1_epi32(10); + let r = _mm_cvtepi32_epi64(a); + let e = _mm_set1_epi64x(10); assert_eq!(r, e); - let a = i32x4::splat(-10); - let r = sse41::_mm_cvtepi32_epi64(a); - let e = i64x2::splat(-10); + let a = _mm_set1_epi32(-10); + let r = _mm_cvtepi32_epi64(a); + let e = _mm_set1_epi64x(-10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepu8_epi16() { - let a = u8x16::splat(10); - let r = sse41::_mm_cvtepu8_epi16(a); - let e = i16x8::splat(10); + unsafe fn test_mm_cvtepu8_epi16() { 
+ let a = _mm_set1_epi8(10); + let r = _mm_cvtepu8_epi16(a); + let e = _mm_set1_epi16(10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepu8_epi32() { - let a = u8x16::splat(10); - let r = sse41::_mm_cvtepu8_epi32(a); - let e = i32x4::splat(10); + unsafe fn test_mm_cvtepu8_epi32() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepu8_epi32(a); + let e = _mm_set1_epi32(10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepu8_epi64() { - let a = u8x16::splat(10); - let r = sse41::_mm_cvtepu8_epi64(a); - let e = i64x2::splat(10); + unsafe fn test_mm_cvtepu8_epi64() { + let a = _mm_set1_epi8(10); + let r = _mm_cvtepu8_epi64(a); + let e = _mm_set1_epi64x(10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepu16_epi32() { - let a = u16x8::splat(10); - let r = sse41::_mm_cvtepu16_epi32(a); - let e = i32x4::splat(10); + unsafe fn test_mm_cvtepu16_epi32() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepu16_epi32(a); + let e = _mm_set1_epi32(10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepu16_epi64() { - let a = u16x8::splat(10); - let r = sse41::_mm_cvtepu16_epi64(a); - let e = i64x2::splat(10); + unsafe fn test_mm_cvtepu16_epi64() { + let a = _mm_set1_epi16(10); + let r = _mm_cvtepu16_epi64(a); + let e = _mm_set1_epi64x(10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_cvtepu32_epi64() { - let a = u32x4::splat(10); - let r = sse41::_mm_cvtepu32_epi64(a); - let e = i64x2::splat(10); + unsafe fn test_mm_cvtepu32_epi64() { + let a = _mm_set1_epi32(10); + let r = _mm_cvtepu32_epi64(a); + let e = _mm_set1_epi64x(10); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_dp_pd() { - let a = f64x2::new(2.0, 3.0); - let b = f64x2::new(1.0, 4.0); - let e = f64x2::new(14.0, 0.0); - assert_eq!(sse41::_mm_dp_pd(a, b, 0b00110001), e); + unsafe fn test_mm_dp_pd() { + let a = _mm_setr_pd(2.0, 3.0); + let b = _mm_setr_pd(1.0, 4.0); + let e = _mm_setr_pd(14.0, 0.0); + assert_eq_m128d(_mm_dp_pd(a, 
b, 0b00110001), e); } #[simd_test = "sse4.1"] - unsafe fn _mm_dp_ps() { - let a = f32x4::new(2.0, 3.0, 1.0, 10.0); - let b = f32x4::new(1.0, 4.0, 0.5, 10.0); - let e = f32x4::new(14.5, 0.0, 14.5, 0.0); - assert_eq!(sse41::_mm_dp_ps(a, b, 0b01110101), e); + unsafe fn test_mm_dp_ps() { + let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0); + let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0); + let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0); + assert_eq_m128(_mm_dp_ps(a, b, 0b01110101), e); } #[simd_test = "sse4.1"] - unsafe fn _mm_floor_pd() { - let a = f64x2::new(2.5, 4.5); - let r = sse41::_mm_floor_pd(a); - let e = f64x2::new(2.0, 4.0); - assert_eq!(r, e); + unsafe fn test_mm_floor_pd() { + let a = _mm_setr_pd(2.5, 4.5); + let r = _mm_floor_pd(a); + let e = _mm_setr_pd(2.0, 4.0); + assert_eq_m128d(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_floor_ps() { - let a = f32x4::new(2.5, 4.5, 8.5, 16.5); - let r = sse41::_mm_floor_ps(a); - let e = f32x4::new(2.0, 4.0, 8.0, 16.0); - assert_eq!(r, e); + unsafe fn test_mm_floor_ps() { + let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); + let r = _mm_floor_ps(a); + let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); + assert_eq_m128(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_floor_sd() { - let a = f64x2::new(2.5, 4.5); - let b = f64x2::new(-1.5, -3.5); - let r = sse41::_mm_floor_sd(a, b); - let e = f64x2::new(-2.0, 4.5); - assert_eq!(r, e); + unsafe fn test_mm_floor_sd() { + let a = _mm_setr_pd(2.5, 4.5); + let b = _mm_setr_pd(-1.5, -3.5); + let r = _mm_floor_sd(a, b); + let e = _mm_setr_pd(-2.0, 4.5); + assert_eq_m128d(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_floor_ss() { - let a = f32x4::new(2.5, 4.5, 8.5, 16.5); - let b = f32x4::new(-1.5, -3.5, -7.5, -15.5); - let r = sse41::_mm_floor_ss(a, b); - let e = f32x4::new(-2.0, 4.5, 8.5, 16.5); - assert_eq!(r, e); + unsafe fn test_mm_floor_ss() { + let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); + let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5); + let r = _mm_floor_ss(a, b); + let e = _mm_setr_ps(-2.0, 4.5, 8.5, 
16.5); + assert_eq_m128(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_ceil_pd() { - let a = f64x2::new(1.5, 3.5); - let r = sse41::_mm_ceil_pd(a); - let e = f64x2::new(2.0, 4.0); - assert_eq!(r, e); + unsafe fn test_mm_ceil_pd() { + let a = _mm_setr_pd(1.5, 3.5); + let r = _mm_ceil_pd(a); + let e = _mm_setr_pd(2.0, 4.0); + assert_eq_m128d(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_ceil_ps() { - let a = f32x4::new(1.5, 3.5, 7.5, 15.5); - let r = sse41::_mm_ceil_ps(a); - let e = f32x4::new(2.0, 4.0, 8.0, 16.0); - assert_eq!(r, e); + unsafe fn test_mm_ceil_ps() { + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let r = _mm_ceil_ps(a); + let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); + assert_eq_m128(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_ceil_sd() { - let a = f64x2::new(1.5, 3.5); - let b = f64x2::new(-2.5, -4.5); - let r = sse41::_mm_ceil_sd(a, b); - let e = f64x2::new(-2.0, 3.5); - assert_eq!(r, e); + unsafe fn test_mm_ceil_sd() { + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let r = _mm_ceil_sd(a, b); + let e = _mm_setr_pd(-2.0, 3.5); + assert_eq_m128d(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_ceil_ss() { - let a = f32x4::new(1.5, 3.5, 7.5, 15.5); - let b = f32x4::new(-2.5, -4.5, -8.5, -16.5); - let r = sse41::_mm_ceil_ss(a, b); - let e = f32x4::new(-2.0, 3.5, 7.5, 15.5); - assert_eq!(r, e); + unsafe fn test_mm_ceil_ss() { + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5); + let r = _mm_ceil_ss(a, b); + let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_round_pd() { - let a = f64x2::new(1.25, 3.75); - let r = sse41::_mm_round_pd(a, sse41::_MM_FROUND_TO_NEAREST_INT); - let e = f64x2::new(1.0, 4.0); - assert_eq!(r, e); + unsafe fn test_mm_round_pd() { + let a = _mm_setr_pd(1.25, 3.75); + let r = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT); + let e = _mm_setr_pd(1.0, 4.0); + assert_eq_m128d(r, e); } #[simd_test = "sse4.1"] - 
unsafe fn _mm_round_ps() { - let a = f32x4::new(2.25, 4.75, -1.75, -4.25); - let r = sse41::_mm_round_ps(a, sse41::_MM_FROUND_TO_ZERO); - let e = f32x4::new(2.0, 4.0, -1.0, -4.0); - assert_eq!(r, e); + unsafe fn test_mm_round_ps() { + let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25); + let r = _mm_round_ps(a, _MM_FROUND_TO_ZERO); + let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0); + assert_eq_m128(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_round_sd() { - use x86::i586::sse; - let a = f64x2::new(1.5, 3.5); - let b = f64x2::new(-2.5, -4.5); - let old_mode = sse::_MM_GET_ROUNDING_MODE(); - sse::_MM_SET_ROUNDING_MODE(sse::_MM_ROUND_TOWARD_ZERO); - let r = sse41::_mm_round_sd(a, b, sse41::_MM_FROUND_CUR_DIRECTION); - sse::_MM_SET_ROUNDING_MODE(old_mode); - let e = f64x2::new(-2.0, 3.5); - assert_eq!(r, e); + unsafe fn test_mm_round_sd() { + let a = _mm_setr_pd(1.5, 3.5); + let b = _mm_setr_pd(-2.5, -4.5); + let old_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + let r = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + _MM_SET_ROUNDING_MODE(old_mode); + let e = _mm_setr_pd(-2.0, 3.5); + assert_eq_m128d(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_round_ss() { - use x86::i586::sse; - let a = f32x4::new(1.5, 3.5, 7.5, 15.5); - let b = f32x4::new(-1.75, -4.5, -8.5, -16.5); - let old_mode = sse::_MM_GET_ROUNDING_MODE(); - sse::_MM_SET_ROUNDING_MODE(sse::_MM_ROUND_NEAREST); - let r = sse41::_mm_round_ss(a, b, sse41::_MM_FROUND_CUR_DIRECTION); - sse::_MM_SET_ROUNDING_MODE(old_mode); - let e = f32x4::new(-2.0, 3.5, 7.5, 15.5); - assert_eq!(r, e); + unsafe fn test_mm_round_ss() { + let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); + let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); + let old_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + let r = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + _MM_SET_ROUNDING_MODE(old_mode); + let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); + assert_eq_m128(r, e); } #[simd_test = "sse4.1"] - unsafe 
fn _mm_minpos_epu16_1() { - let a = u16x8::new(23, 18, 44, 97, 50, 13, 67, 66); - let r = sse41::_mm_minpos_epu16(a); - let e = u16x8::new(13, 5, 0, 0, 0, 0, 0, 0); + unsafe fn test_mm_minpos_epu16_1() { + let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66); + let r = _mm_minpos_epu16(a); + let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_minpos_epu16_2() { - let a = u16x8::new(0, 18, 44, 97, 50, 13, 67, 66); - let r = sse41::_mm_minpos_epu16(a); - let e = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + unsafe fn test_mm_minpos_epu16_2() { + let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66); + let r = _mm_minpos_epu16(a); + let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_mul_epi32() { + unsafe fn test_mm_mul_epi32() { { - let a = i32x4::new(1, 1, 1, 1); - let b = i32x4::new(1, 2, 3, 4); - let r = sse41::_mm_mul_epi32(a, b); - let e = i64x2::new(1, 3); + let a = _mm_setr_epi32(1, 1, 1, 1); + let b = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm_mul_epi32(a, b); + let e = _mm_setr_epi64x(1, 3); assert_eq!(r, e); } { - let a = i32x4::new( + let a = _mm_setr_epi32( 15, 2, /* ignored */ 1234567, 4, /* ignored */ ); - let b = i32x4::new( + let b = _mm_setr_epi32( -20, -256, /* ignored */ 666666, 666666, /* ignored */ ); - let r = sse41::_mm_mul_epi32(a, b); - let e = i64x2::new(-300, 823043843622); + let r = _mm_mul_epi32(a, b); + let e = _mm_setr_epi64x(-300, 823043843622); assert_eq!(r, e); } } #[simd_test = "sse4.1"] - unsafe fn _mm_mullo_epi32() { + unsafe fn test_mm_mullo_epi32() { { - let a = i32x4::new(1, 1, 1, 1); - let b = i32x4::new(1, 2, 3, 4); - let r = sse41::_mm_mullo_epi32(a, b); - let e = i32x4::new(1, 2, 3, 4); + let a = _mm_setr_epi32(1, 1, 1, 1); + let b = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm_mullo_epi32(a, b); + let e = _mm_setr_epi32(1, 2, 3, 4); assert_eq!(r, e); } { - let a = i32x4::new(15, -2, 1234567, 99999); - let b = 
i32x4::new(-20, -256, 666666, -99999); - let r = sse41::_mm_mullo_epi32(a, b); + let a = _mm_setr_epi32(15, -2, 1234567, 99999); + let b = _mm_setr_epi32(-20, -256, 666666, -99999); + let r = _mm_mullo_epi32(a, b); // Attention, most significant bit in r[2] is treated // as a sign bit: // 1234567 * 666666 = -1589877210 - let e = i32x4::new(-300, 512, -1589877210, -1409865409); + let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409); assert_eq!(r, e); } } #[simd_test = "sse4.1"] - unsafe fn _mm_minpos_epu16() { - let a = u16x8::new(8, 7, 6, 5, 4, 1, 2, 3); - let r = sse41::_mm_minpos_epu16(a); - let e = u16x8::splat(0).replace(0, 1).replace(1, 5); + unsafe fn test_mm_minpos_epu16() { + let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3); + let r = _mm_minpos_epu16(a); + let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0); assert_eq!(r, e); } #[simd_test = "sse4.1"] - unsafe fn _mm_mpsadbw_epu8() { + unsafe fn test_mm_mpsadbw_epu8() { let a = - u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = sse41::_mm_mpsadbw_epu8(a, a, 0b000); - let e = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28); + let r = _mm_mpsadbw_epu8(a, a, 0b000); + let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); assert_eq!(r, e); - let r = sse41::_mm_mpsadbw_epu8(a, a, 0b001); - let e = u16x8::new(16, 12, 8, 4, 0, 4, 8, 12); + let r = _mm_mpsadbw_epu8(a, a, 0b001); + let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12); assert_eq!(r, e); - let r = sse41::_mm_mpsadbw_epu8(a, a, 0b100); - let e = u16x8::new(16, 20, 24, 28, 32, 36, 40, 44); + let r = _mm_mpsadbw_epu8(a, a, 0b100); + let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44); assert_eq!(r, e); - let r = sse41::_mm_mpsadbw_epu8(a, a, 0b101); - let e = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28); + let r = _mm_mpsadbw_epu8(a, a, 0b101); + let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); assert_eq!(r, e); - let r = sse41::_mm_mpsadbw_epu8(a, a, 0b111); - let e 
= u16x8::new(32, 28, 24, 20, 16, 12, 8, 4); + let r = _mm_mpsadbw_epu8(a, a, 0b111); + let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4); assert_eq!(r, e); } } diff --git a/coresimd/src/x86/test.rs b/coresimd/src/x86/test.rs index a4a6e358b5297..f5614861b96ad 100644 --- a/coresimd/src/x86/test.rs +++ b/coresimd/src/x86/test.rs @@ -30,3 +30,10 @@ pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 { union A { a: __m128, b: [f32; 4] }; mem::transmute::<__m128, A>(a).b[idx] } + +// not actually an intrinsic but useful in various tests as we ported from +// `i64x2::new` which is backwards from `_mm_set_epi64x` +#[target_feature = "+sse2"] +pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i { + _mm_set_epi64x(b, a) +}