From 03cb92ddce074a5170ed5e5c5c20e5fa4e4846c3 Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Thu, 11 Jan 2018 23:18:15 -0600
Subject: [PATCH] Migrate i586::sse41 to vendor types (#276)

---
 coresimd/src/x86/i586/sse2.rs  |   6 -
 coresimd/src/x86/i586/sse41.rs | 878 +++++++++++++++++----------------
 coresimd/src/x86/test.rs       |   7 +
 3 files changed, 459 insertions(+), 432 deletions(-)

diff --git a/coresimd/src/x86/i586/sse2.rs b/coresimd/src/x86/i586/sse2.rs
index 22da044b32205..64f4d568c4af2 100644
--- a/coresimd/src/x86/i586/sse2.rs
+++ b/coresimd/src/x86/i586/sse2.rs
@@ -2333,12 +2333,6 @@ mod tests {
     use x86::*;
     use v128::*;
 
-    // not actually an intrinsics in SSE2 but useful in the tests below
-    #[target_feature = "+sse2"]
-    unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
-        _mm_set_epi64x(b, a)
-    }
-
     #[simd_test = "sse2"]
     unsafe fn test_mm_pause() {
         _mm_pause();
diff --git a/coresimd/src/x86/i586/sse41.rs b/coresimd/src/x86/i586/sse41.rs
index 60f972f4d08ab..b5c50fbb65375 100644
--- a/coresimd/src/x86/i586/sse41.rs
+++ b/coresimd/src/x86/i586/sse41.rs
@@ -4,9 +4,10 @@ use core::mem;
 #[cfg(test)]
 use stdsimd_test::assert_instr;
 
-use simd_llvm::{simd_shuffle2, simd_shuffle4, simd_shuffle8};
+use simd_llvm::*;
 use v128::*;
+use x86::*;
 
 // SSE4 rounding constans
 /// round to nearest
@@ -49,8 +50,8 @@ pub const _MM_FROUND_NEARBYINT: i32 =
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pblendvb))]
-pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 {
-    pblendvb(a, b, mask)
+pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
+    mem::transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
 }
 
 /// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
@@ -61,11 +62,13 @@ pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
-pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: i32) -> i16x8 {
+pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    let a = a.as_i16x8();
+    let b = b.as_i16x8();
     macro_rules! call {
         ($imm8:expr) => { pblendw(a, b, $imm8) }
     }
-    constify_imm8!(imm8, call)
+    mem::transmute(constify_imm8!(imm8, call))
 }
 
 /// Blend packed double-precision (64-bit) floating-point elements from `a`
@@ -73,7 +76,7 @@ pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: i32) -> i16x8 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(blendvpd))]
-pub unsafe fn _mm_blendv_pd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2 {
+pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
     blendvpd(a, b, mask)
 }
 
@@ -82,7 +85,7 @@ pub unsafe fn _mm_blendv_pd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(blendvps))]
-pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 {
+pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
     blendvps(a, b, mask)
 }
 
@@ -91,7 +94,7 @@ pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))]
-pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: i32) -> f64x2 {
+pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d {
     macro_rules! call {
         ($imm2:expr) => { blendpd(a, b, $imm2) }
     }
@@ -103,7 +106,7 @@ pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: i32) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))]
-pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: i32) -> f32x4 {
+pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 {
     macro_rules! call {
         ($imm4:expr) => { blendps(a, b, $imm4) }
     }
@@ -116,8 +119,8 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: i32) -> f32x4 {
 #[target_feature = "+sse4.1"]
 // TODO: Add test for Windows
 #[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8 = 0))]
-pub unsafe fn _mm_extract_ps(a: f32x4, imm8: i32) -> i32 {
-    mem::transmute(a.extract(imm8 as u32 & 0b11))
+pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 {
+    mem::transmute(simd_extract::<_, f32>(a, imm8 as u32 & 0b11))
 }
 
 /// Extract an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
@@ -127,9 +130,9 @@ pub unsafe fn _mm_extract_ps(a: f32x4, imm8: i32) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pextrb, imm8 = 0))]
-pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: i32) -> i32 {
+pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 {
     let imm8 = (imm8 & 15) as u32;
-    (a.extract_unchecked(imm8) as i32) & 0xFF
+    simd_extract::<_, u8>(a.as_u8x16(), imm8) as i32
 }
 
 /// Extract an 32-bit integer from `a` selected with `imm8`
@@ -137,9 +140,9 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: i32) -> i32 {
 #[target_feature = "+sse4.1"]
 // TODO: Add test for Windows
 #[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8 = 1))]
-pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: i32) -> i32 {
+pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 {
     let imm8 = (imm8 & 3) as u32;
-    a.extract_unchecked(imm8) as i32
+    simd_extract::<_, i32>(a.as_i32x4(), imm8)
 }
 
 /// Select a single value in `a` to store at some position in `b`,
@@ -167,7 +170,7 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: i32) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))]
-pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
+pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
     macro_rules! call {
         ($imm8:expr) => { insertps(a, b, $imm8) }
     }
@@ -179,8 +182,8 @@ pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))]
-pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: i32) -> i8x16 {
-    a.replace((imm8 & 0b1111) as u32, i)
+pub unsafe fn _mm_insert_epi8(a: __m128i, i: i8, imm8: i32) -> __m128i {
+    mem::transmute(simd_insert(a.as_i8x16(), (imm8 & 0b1111) as u32, i))
 }
 
 /// Return a copy of `a` with the 32-bit integer from `i` inserted at a
@@ -188,8 +191,8 @@ pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: i32) -> i8x16 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))]
-pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: i32) -> i32x4 {
-    a.replace((imm8 & 0b11) as u32, i)
+pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {
+    mem::transmute(simd_insert(a.as_i32x4(), (imm8 & 0b11) as u32, i))
 }
 
 /// Compare packed 8-bit integers in `a` and `b` and return packed maximum
@@ -197,8 +200,8 @@ pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: i32) -> i32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmaxsb))]
-pub unsafe fn _mm_max_epi8(a: i8x16, b: i8x16) -> i8x16 {
-    pmaxsb(a, b)
+pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
 }
 
 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed
@@ -206,8 +209,8 @@ pub unsafe fn _mm_max_epi8(a: i8x16, b: i8x16) -> i8x16 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmaxuw))]
-pub unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 {
-    pmaxuw(a, b)
+pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
 }
 
 /// Compare packed 32-bit integers in `a` and `b`, and return packed maximum
@@ -215,8 +218,8 @@ pub unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmaxsd))]
-pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 {
-    pmaxsd(a, b)
+pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
 }
 
 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed
@@ -224,8 +227,8 @@ pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmaxud))]
-pub unsafe fn _mm_max_epu32(a: u32x4, b: u32x4) -> u32x4 {
-    pmaxud(a, b)
+pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
 }
 
 /// Compare packed 8-bit integers in `a` and `b` and return packed minimum
@@ -233,8 +236,8 @@ pub unsafe fn _mm_max_epu32(a: u32x4, b: u32x4) -> u32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pminsb))]
-pub unsafe fn _mm_min_epi8(a: i8x16, b: i8x16) -> i8x16 {
-    pminsb(a, b)
+pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
 }
 
 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed
@@ -242,8 +245,8 @@ pub unsafe fn _mm_min_epi8(a: i8x16, b: i8x16) -> i8x16 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pminuw))]
-pub unsafe fn _mm_min_epu16(a: u16x8, b: u16x8) -> u16x8 {
-    pminuw(a, b)
+pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
 }
 
 /// Compare packed 32-bit integers in `a` and `b`, and return packed minimum
@@ -251,8 +254,8 @@ pub unsafe fn _mm_min_epu16(a: u16x8, b: u16x8) -> u16x8 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pminsd))]
-pub unsafe fn _mm_min_epi32(a: i32x4, b: i32x4) -> i32x4 {
-    pminsd(a, b)
+pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
 }
 
 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed
@@ -260,8 +263,8 @@ pub unsafe fn _mm_min_epi32(a: i32x4, b: i32x4) -> i32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pminud))]
-pub unsafe fn _mm_min_epu32(a: u32x4, b: u32x4) -> u32x4 {
-    pminud(a, b)
+pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pminud(a.as_u32x4(), b.as_u32x4()))
 }
 
 /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
@@ -269,32 +272,36 @@ pub unsafe fn _mm_min_epu32(a: u32x4, b: u32x4) -> u32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(packusdw))]
-pub unsafe fn _mm_packus_epi32(a: i32x4, b: i32x4) -> u16x8 {
-    packusdw(a, b)
+pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
 }
 
 /// Compare packed 64-bit integers in `a` and `b` for equality
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pcmpeqq))]
-pub unsafe fn _mm_cmpeq_epi64(a: i64x2, b: i64x2) -> i64x2 {
-    a.eq(b)
+pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
 }
 
 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovsxbw))]
-pub unsafe fn _mm_cvtepi8_epi16(a: i8x16) -> i16x8 {
-    simd_shuffle8::<_, ::v64::i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]).as_i16x8()
+pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
+    let a = a.as_i8x16();
+    let a = simd_shuffle8::<_, ::v64::i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    mem::transmute(simd_cast::<_, i16x8>(a))
 }
 
 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovsxbd))]
-pub unsafe fn _mm_cvtepi8_epi32(a: i8x16) -> i32x4 {
-    simd_shuffle4::<_, ::v32::i8x4>(a, a, [0, 1, 2, 3]).as_i32x4()
+pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
+    let a = a.as_i8x16();
+    let a = simd_shuffle4::<_, ::v32::i8x4>(a, a, [0, 1, 2, 3]);
+    mem::transmute(simd_cast::<_, i32x4>(a))
 }
 
 /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
@@ -302,56 +309,70 @@ pub unsafe fn _mm_cvtepi8_epi32(a: i8x16) -> i32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovsxbq))]
-pub unsafe fn _mm_cvtepi8_epi64(a: i8x16) -> i64x2 {
-    simd_shuffle2::<_, ::v16::i8x2>(a, a, [0, 1]).as_i64x2()
+pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
+    let a = a.as_i8x16();
+    let a = simd_shuffle2::<_, ::v16::i8x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
 }
 
 /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovsxwd))]
-pub unsafe fn _mm_cvtepi16_epi32(a: i16x8) -> i32x4 {
-    simd_shuffle4::<_, ::v64::i16x4>(a, a, [0, 1, 2, 3]).as_i32x4()
+pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
+    let a = a.as_i16x8();
+    let a = simd_shuffle4::<_, ::v64::i16x4>(a, a, [0, 1, 2, 3]);
+    mem::transmute(simd_cast::<_, i32x4>(a))
 }
 
 /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovsxwq))]
-pub unsafe fn _mm_cvtepi16_epi64(a: i16x8) -> i64x2 {
-    simd_shuffle2::<_, ::v32::i16x2>(a, a, [0, 1]).as_i64x2()
+pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
+    let a = a.as_i16x8();
+    let a = simd_shuffle2::<_, ::v32::i16x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
 }
 
 /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovsxdq))]
-pub unsafe fn _mm_cvtepi32_epi64(a: i32x4) -> i64x2 {
-    simd_shuffle2::<_, ::v64::i32x2>(a, a, [0, 1]).as_i64x2()
+pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
+    let a = a.as_i32x4();
+    let a = simd_shuffle2::<_, ::v64::i32x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
 }
 
 /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovzxbw))]
-pub unsafe fn _mm_cvtepu8_epi16(a: u8x16) -> i16x8 {
-    simd_shuffle8::<_, ::v64::u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]).as_i16x8()
+pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
+    let a = a.as_u8x16();
+    let a = simd_shuffle8::<_, ::v64::u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    mem::transmute(simd_cast::<_, i16x8>(a))
 }
 
 /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovzxbd))]
-pub unsafe fn _mm_cvtepu8_epi32(a: u8x16) -> i32x4 {
-    simd_shuffle4::<_, ::v32::u8x4>(a, a, [0, 1, 2, 3]).as_i32x4()
+pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
+    let a = a.as_u8x16();
+    let a = simd_shuffle4::<_, ::v32::u8x4>(a, a, [0, 1, 2, 3]);
+    mem::transmute(simd_cast::<_, i32x4>(a))
 }
 
 /// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovzxbq))]
-pub unsafe fn _mm_cvtepu8_epi64(a: u8x16) -> i64x2 {
-    simd_shuffle2::<_, ::v16::u8x2>(a, a, [0, 1]).as_i64x2()
+pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
+    let a = a.as_u8x16();
+    let a = simd_shuffle2::<_, ::v16::u8x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
 }
 
 /// Zero extend packed unsigned 16-bit integers in `a`
@@ -359,8 +380,10 @@ pub unsafe fn _mm_cvtepu8_epi64(a: u8x16) -> i64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovzxwd))]
-pub unsafe fn _mm_cvtepu16_epi32(a: u16x8) -> i32x4 {
-    simd_shuffle4::<_, ::v64::u16x4>(a, a, [0, 1, 2, 3]).as_i32x4()
+pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
+    let a = a.as_u16x8();
+    let a = simd_shuffle4::<_, ::v64::u16x4>(a, a, [0, 1, 2, 3]);
+    mem::transmute(simd_cast::<_, i32x4>(a))
 }
 
 /// Zero extend packed unsigned 16-bit integers in `a`
@@ -368,8 +391,10 @@ pub unsafe fn _mm_cvtepu16_epi32(a: u16x8) -> i32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovzxwq))]
-pub unsafe fn _mm_cvtepu16_epi64(a: u16x8) -> i64x2 {
-    simd_shuffle2::<_, ::v32::u16x2>(a, a, [0, 1]).as_i64x2()
+pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
+    let a = a.as_u16x8();
+    let a = simd_shuffle2::<_, ::v32::u16x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
 }
 
 /// Zero extend packed unsigned 32-bit integers in `a`
@@ -377,11 +402,13 @@ pub unsafe fn _mm_cvtepu16_epi64(a: u16x8) -> i64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmovzxdq))]
-pub unsafe fn _mm_cvtepu32_epi64(a: u32x4) -> i64x2 {
-    simd_shuffle2::<_, ::v64::u32x2>(a, a, [0, 1]).as_i64x2()
+pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
+    let a = a.as_u32x4();
+    let a = simd_shuffle2::<_, ::v64::u32x2>(a, a, [0, 1]);
+    mem::transmute(simd_cast::<_, i64x2>(a))
 }
 
-/// Returns the dot product of two f64x2 vectors.
+/// Returns the dot product of two __m128d vectors.
 ///
 /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
 /// If a condition mask bit is zero, the corresponding multiplication is
@@ -391,14 +418,14 @@ pub unsafe fn _mm_cvtepu32_epi64(a: u32x4) -> i64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(dppd, imm8 = 0))]
-pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
+pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
     macro_rules! call {
         ($imm8:expr) => { dppd(a, b, $imm8) }
     }
     constify_imm8!(imm8, call)
 }
 
-/// Returns the dot product of two f32x4 vectors.
+/// Returns the dot product of two __m128 vectors.
 ///
 /// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
 /// If a condition mask bit is zero, the corresponding multiplication is
@@ -408,7 +435,7 @@ pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(dpps, imm8 = 0))]
-pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
+pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
     macro_rules! call {
         ($imm8:expr) => { dpps(a, b, $imm8) }
     }
@@ -421,7 +448,7 @@ pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundpd))]
-pub unsafe fn _mm_floor_pd(a: f64x2) -> f64x2 {
+pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
     roundpd(a, _MM_FROUND_FLOOR)
 }
 
@@ -431,7 +458,7 @@ pub unsafe fn _mm_floor_pd(a: f64x2) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundps))]
-pub unsafe fn _mm_floor_ps(a: f32x4) -> f32x4 {
+pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    roundps(a, _MM_FROUND_FLOOR)
 }
 
@@ -443,7 +470,7 @@ pub unsafe fn _mm_floor_ps(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundsd))]
-pub unsafe fn _mm_floor_sd(a: f64x2, b: f64x2) -> f64x2 {
+pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
     roundsd(a, b, _MM_FROUND_FLOOR)
 }
 
@@ -455,7 +482,7 @@ pub unsafe fn _mm_floor_sd(a: f64x2, b: f64x2) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundss))]
-pub unsafe fn _mm_floor_ss(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
     roundss(a, b, _MM_FROUND_FLOOR)
 }
 
@@ -465,7 +492,7 @@ pub unsafe fn _mm_floor_ss(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundpd))]
-pub unsafe fn _mm_ceil_pd(a: f64x2) -> f64x2 {
+pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
     roundpd(a, _MM_FROUND_CEIL)
 }
 
@@ -475,7 +502,7 @@ pub unsafe fn _mm_ceil_pd(a: f64x2) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundps))]
-pub unsafe fn _mm_ceil_ps(a: f32x4) -> f32x4 {
+pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
     roundps(a, _MM_FROUND_CEIL)
 }
 
@@ -487,7 +514,7 @@ pub unsafe fn _mm_ceil_ps(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundsd))]
-pub unsafe fn _mm_ceil_sd(a: f64x2, b: f64x2) -> f64x2 {
+pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
     roundsd(a, b, _MM_FROUND_CEIL)
 }
 
@@ -499,7 +526,7 @@ pub unsafe fn _mm_ceil_sd(a: f64x2, b: f64x2) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundss))]
-pub unsafe fn _mm_ceil_ss(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
     roundss(a, b, _MM_FROUND_CEIL)
 }
 
@@ -525,7 +552,7 @@ pub unsafe fn _mm_ceil_ss(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundpd, rounding = 0))]
-pub unsafe fn _mm_round_pd(a: f64x2, rounding: i32) -> f64x2 {
+pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d {
     macro_rules! call {
         ($imm4:expr) => { roundpd(a, $imm4) }
     }
@@ -554,7 +581,7 @@ pub unsafe fn _mm_round_pd(a: f64x2, rounding: i32) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundps, rounding = 0))]
-pub unsafe fn _mm_round_ps(a: f32x4, rounding: i32) -> f32x4 {
+pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 {
     macro_rules! call {
         ($imm4:expr) => { roundps(a, $imm4) }
     }
@@ -585,7 +612,7 @@ pub unsafe fn _mm_round_ps(a: f32x4, rounding: i32) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundsd, rounding = 0))]
-pub unsafe fn _mm_round_sd(a: f64x2, b: f64x2, rounding: i32) -> f64x2 {
+pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
     macro_rules! call {
         ($imm4:expr) => { roundsd(a, b, $imm4) }
     }
@@ -616,14 +643,14 @@ pub unsafe fn _mm_round_sd(a: f64x2, b: f64x2, rounding: i32) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(roundss, rounding = 0))]
-pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 {
+pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
     macro_rules! call {
         ($imm4:expr) => { roundss(a, b, $imm4) }
     }
     constify_imm4!(rounding, call)
 }
 
-/// Finds the minimum unsigned 16-bit element in the 128-bit u16x8 vector,
+/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector,
 /// returning a vector containing its value in its first position, and its
 /// index
 /// in its second position; all other elements are set to zero.
@@ -633,7 +660,7 @@ pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 {
 ///
 /// Arguments:
 ///
-/// * `a` - A 128-bit vector of type `u16x8`.
+/// * `a` - A 128-bit vector of type `__m128i`.
 ///
 /// Returns:
 ///
@@ -645,8 +672,8 @@ pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(phminposuw))]
-pub unsafe fn _mm_minpos_epu16(a: u16x8) -> u16x8 {
-    phminposuw(a)
+pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
+    mem::transmute(phminposuw(a.as_u16x8()))
 }
 
 /// Multiply the low 32-bit integers from each packed 64-bit
@@ -654,21 +681,21 @@ pub unsafe fn _mm_minpos_epu16(a: u16x8) -> u16x8 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmuldq))]
-pub unsafe fn _mm_mul_epi32(a: i32x4, b: i32x4) -> i64x2 {
-    pmuldq(a, b)
+pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
 }
 
 /// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate
 /// 64-bit integers, and returns the lowest 32-bit, whatever they might be,
-/// reinterpreted as a signed integer. While `pmulld i32x4::splat(2),
-/// i32x4::splat(2)` returns the obvious `i32x4::splat(4)`, due to wrapping
-/// arithmetic `pmulld i32x4::splat(i32::MAX), i32x4::splat(2)` would return a
+/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2),
+/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
+/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would return a
 /// negative number.
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmulld))]
-pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 {
-    a * b
+pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
+    mem::transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
 }
 
 /// Subtracts 8-bit unsigned integer values and computes the absolute
@@ -691,8 +718,8 @@ pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 {
 ///
 /// Arguments:
 ///
-/// * `a` - A 128-bit vector of type `i8x16`.
-/// * `b` - A 128-bit vector of type `i8x16`.
+/// * `a` - A 128-bit vector of type `__m128i`.
+/// * `b` - A 128-bit vector of type `__m128i`.
 /// * `imm8` - An 8-bit immediate operand specifying how the absolute
 ///            differences are to be calculated
 ///     * Bit `[2]` specify the offset for operand `a`
///     * Bit `[1:0]` specify the offset for operand `b`
@@ -700,16 +727,18 @@ pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 {
 ///
 /// Returns:
 ///
-/// * A `i16x8` vector containing the sums of the sets of
+/// * A `__m128i` vector containing the sums of the sets of
 ///   absolute differences between both operands.
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))]
-pub unsafe fn _mm_mpsadbw_epu8(a: u8x16, b: u8x16, imm8: i32) -> u16x8 {
+pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    let a = a.as_u8x16();
+    let b = b.as_u8x16();
     macro_rules! call {
         ($imm8:expr) => { mpsadbw(a, b, $imm8) }
     }
-    constify_imm3!(imm8, call)
+    mem::transmute(constify_imm3!(imm8, call))
 }
 
 #[allow(improper_ctypes)]
@@ -717,17 +746,17 @@ extern "C" {
     #[link_name = "llvm.x86.sse41.pblendvb"]
     fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
     #[link_name = "llvm.x86.sse41.blendvpd"]
-    fn blendvpd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2;
+    fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
     #[link_name = "llvm.x86.sse41.blendvps"]
-    fn blendvps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4;
+    fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
     #[link_name = "llvm.x86.sse41.blendpd"]
-    fn blendpd(a: f64x2, b: f64x2, imm2: u8) -> f64x2;
+    fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
     #[link_name = "llvm.x86.sse41.blendps"]
-    fn blendps(a: f32x4, b: f32x4, imm4: u8) -> f32x4;
+    fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
     #[link_name = "llvm.x86.sse41.pblendw"]
     fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
     #[link_name = "llvm.x86.sse41.insertps"]
-    fn insertps(a: f32x4, b: f32x4, imm8: u8) -> f32x4;
+    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
     #[link_name = "llvm.x86.sse41.pmaxsb"]
     fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
     #[link_name = "llvm.x86.sse41.pmaxuw"]
@@ -747,17 +776,17 @@ extern "C" {
     #[link_name = "llvm.x86.sse41.packusdw"]
     fn packusdw(a: i32x4, b: i32x4) -> u16x8;
     #[link_name = "llvm.x86.sse41.dppd"]
-    fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2;
+    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
     #[link_name = "llvm.x86.sse41.dpps"]
-    fn dpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4;
+    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
     #[link_name = "llvm.x86.sse41.round.pd"]
-    fn roundpd(a: f64x2, rounding: i32) -> f64x2;
+    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
     #[link_name = "llvm.x86.sse41.round.ps"]
-    fn roundps(a: f32x4, rounding: i32) -> f32x4;
+    fn roundps(a: __m128, rounding: i32) -> __m128;
     #[link_name = "llvm.x86.sse41.round.sd"]
-    fn roundsd(a: f64x2, b: f64x2, rounding: i32) -> f64x2;
+    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
     #[link_name = "llvm.x86.sse41.round.ss"]
-    fn roundss(a: f32x4, b: f32x4, rounding: i32) -> f32x4;
+    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
     #[link_name = "llvm.x86.sse41.phminposuw"]
     fn phminposuw(a: u16x8) -> u16x8;
     #[link_name = "llvm.x86.sse41.pmuldq"]
@@ -770,145 +799,144 @@ mod tests {
     use std::mem;
     use stdsimd_test::simd_test;
-    use x86::i586::sse41;
-    use v128::*;
+    use x86::*;
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_blendv_epi8() {
+    unsafe fn test_mm_blendv_epi8() {
         let a =
-            i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+            _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
         #[cfg_attr(rustfmt, rustfmt_skip)]
-        let b = i8x16::new(
+        let b = _mm_setr_epi8(
             16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
         );
         let mask =
-            i8x16::new(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+            _mm_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
         #[cfg_attr(rustfmt, rustfmt_skip)]
-        let e = i8x16::new(
+        let e = _mm_setr_epi8(
             0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
         );
-        assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e);
+        assert_eq!(_mm_blendv_epi8(a, b, mask), e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_blendv_pd() {
-        let a = f64x2::splat(0.0);
-        let b = f64x2::splat(1.0);
-        let mask = mem::transmute(i64x2::new(0, -1));
-        let r = sse41::_mm_blendv_pd(a, b, mask);
-        let e = f64x2::new(0.0, 1.0);
-        assert_eq!(r, e);
+    unsafe fn test_mm_blendv_pd() {
+        let a = _mm_set1_pd(0.0);
+        let b = _mm_set1_pd(1.0);
+        let mask = mem::transmute(_mm_setr_epi64x(0, -1));
+        let r = _mm_blendv_pd(a, b, mask);
+        let e = _mm_setr_pd(0.0, 1.0);
+        assert_eq_m128d(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_blendv_ps() {
-        let a = f32x4::splat(0.0);
-        let b = f32x4::splat(1.0);
-        let mask = mem::transmute(i32x4::new(0, -1, 0, -1));
-        let r = sse41::_mm_blendv_ps(a, b, mask);
-        let e = f32x4::new(0.0, 1.0, 0.0, 1.0);
-        assert_eq!(r, e);
+    unsafe fn test_mm_blendv_ps() {
+        let a = _mm_set1_ps(0.0);
+        let b = _mm_set1_ps(1.0);
+        let mask = mem::transmute(_mm_setr_epi32(0, -1, 0, -1));
+        let r = _mm_blendv_ps(a, b, mask);
+        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
+        assert_eq_m128(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_blend_pd() {
-        let a = f64x2::splat(0.0);
-        let b = f64x2::splat(1.0);
-        let r = sse41::_mm_blend_pd(a, b, 0b10);
-        let e = f64x2::new(0.0, 1.0);
-        assert_eq!(r, e);
+    unsafe fn test_mm_blend_pd() {
+        let a = _mm_set1_pd(0.0);
+        let b = _mm_set1_pd(1.0);
+        let r = _mm_blend_pd(a, b, 0b10);
+        let e = _mm_setr_pd(0.0, 1.0);
+        assert_eq_m128d(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_blend_ps() {
-        let a = f32x4::splat(0.0);
-        let b = f32x4::splat(1.0);
-        let r = sse41::_mm_blend_ps(a, b, 0b1010);
-        let e = f32x4::new(0.0, 1.0, 0.0, 1.0);
-        assert_eq!(r, e);
+    unsafe fn test_mm_blend_ps() {
+        let a = _mm_set1_ps(0.0);
+        let b = _mm_set1_ps(1.0);
+        let r = _mm_blend_ps(a, b, 0b1010);
+        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
+        assert_eq_m128(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_blend_epi16() {
-        let a = i16x8::splat(0);
-        let b = i16x8::splat(1);
-        let r = sse41::_mm_blend_epi16(a, b, 0b1010_1100);
-        let e = i16x8::new(0, 0, 1, 1, 0, 1, 0, 1);
+    unsafe fn test_mm_blend_epi16() {
+        let a = _mm_set1_epi16(0);
+        let b = _mm_set1_epi16(1);
+        let r = _mm_blend_epi16(a, b, 0b1010_1100);
+        let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_extract_ps() {
-        let a = f32x4::new(0.0, 1.0, 2.0, 3.0);
-        let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 1));
+    unsafe fn test_mm_extract_ps() {
+        let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
+        let r: f32 = mem::transmute(_mm_extract_ps(a, 1));
         assert_eq!(r, 1.0);
-        let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 5));
+        let r: f32 = mem::transmute(_mm_extract_ps(a, 5));
         assert_eq!(r, 1.0);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_extract_epi8() {
+    unsafe fn test_mm_extract_epi8() {
         let a =
-            i8x16::new(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let r1 = sse41::_mm_extract_epi8(a, 0);
-        let r2 = sse41::_mm_extract_epi8(a, 19);
+            _mm_setr_epi8(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r1 = _mm_extract_epi8(a, 0);
+        let r2 = _mm_extract_epi8(a, 19);
         assert_eq!(r1, 0xFF);
         assert_eq!(r2, 3);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_extract_epi32() {
-        let a = i32x4::new(0, 1, 2, 3);
-        let r = sse41::_mm_extract_epi32(a, 1);
+    unsafe fn test_mm_extract_epi32() {
+        let a = _mm_setr_epi32(0, 1, 2, 3);
+        let r = _mm_extract_epi32(a, 1);
         assert_eq!(r, 1);
-        let r = sse41::_mm_extract_epi32(a, 5);
+        let r = _mm_extract_epi32(a, 5);
         assert_eq!(r, 1);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_insert_ps() {
-        let a = f32x4::splat(1.0);
-        let b = f32x4::new(1.0, 2.0, 3.0, 4.0);
-        let r = sse41::_mm_insert_ps(a, b, 0b11_00_1100);
-        let e = f32x4::new(4.0, 1.0, 0.0, 0.0);
-        assert_eq!(r, e);
+    unsafe fn test_mm_insert_ps() {
+        let a = _mm_set1_ps(1.0);
+        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+        let r = _mm_insert_ps(a, b, 0b11_00_1100);
+        let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
+        assert_eq_m128(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_insert_epi8() {
-        let a = i8x16::splat(0);
-        let e = i8x16::splat(0).replace(1, 32);
-        let r = sse41::_mm_insert_epi8(a, 32, 1);
+    unsafe fn test_mm_insert_epi8() {
+        let a = _mm_set1_epi8(0);
+        let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        let r = _mm_insert_epi8(a, 32, 1);
         assert_eq!(r, e);
-        let r = sse41::_mm_insert_epi8(a, 32, 17);
+        let r = _mm_insert_epi8(a, 32, 17);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_insert_epi32() {
-        let a = i32x4::splat(0);
-        let e = i32x4::splat(0).replace(1, 32);
-        let r = sse41::_mm_insert_epi32(a, 32, 1);
+    unsafe fn test_mm_insert_epi32() {
+        let a = _mm_set1_epi32(0);
+        let e = _mm_setr_epi32(0, 32, 0, 0);
+        let r = _mm_insert_epi32(a, 32, 1);
         assert_eq!(r, e);
-        let r = sse41::_mm_insert_epi32(a, 32, 5);
+        let r = _mm_insert_epi32(a, 32, 5);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_max_epi8() {
+    unsafe fn test_mm_max_epi8() {
        #[cfg_attr(rustfmt, rustfmt_skip)]
-        let a = i8x16::new(
+        let a = _mm_setr_epi8(
             1, 4, 5, 8, 9, 12, 13, 16,
             17, 20, 21, 24, 25, 28, 29, 32,
         );
         #[cfg_attr(rustfmt, rustfmt_skip)]
-        let b = i8x16::new(
+        let b = _mm_setr_epi8(
             2, 3, 6, 7, 10, 11, 14, 15,
             18, 19, 22, 23, 26, 27, 30, 31,
         );
-        let r = sse41::_mm_max_epi8(a, b);
+        let r = _mm_max_epi8(a, b);
         #[cfg_attr(rustfmt, rustfmt_skip)]
-        let e = i8x16::new(
+        let e = _mm_setr_epi8(
             2, 4, 6, 8, 10, 12, 14, 16,
             18, 20, 22, 24, 26, 28, 30, 32,
         );
@@ -916,47 +944,47 @@ mod tests {
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_max_epu16() {
-        let a = u16x8::new(1, 4, 5, 8, 9, 12, 13, 16);
-        let b = u16x8::new(2, 3, 6, 7, 10, 11, 14, 15);
-        let r = sse41::_mm_max_epu16(a, b);
-        let e = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
+    unsafe fn test_mm_max_epu16() {
+        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
+        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
+        let r = _mm_max_epu16(a, b);
+        let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_max_epi32() {
-        let a = i32x4::new(1, 4, 5, 8);
-        let b = i32x4::new(2, 3, 6, 7);
-        let r = sse41::_mm_max_epi32(a, b);
-        let e = i32x4::new(2, 4, 6, 8);
+    unsafe fn test_mm_max_epi32() {
+        let a = _mm_setr_epi32(1, 4, 5, 8);
+        let b = _mm_setr_epi32(2, 3, 6, 7);
+        let r = _mm_max_epi32(a, b);
+        let e = _mm_setr_epi32(2, 4, 6, 8);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_max_epu32() {
-        let a = u32x4::new(1, 4, 5, 8);
-        let b = u32x4::new(2, 3, 6, 7);
-        let r = sse41::_mm_max_epu32(a, b);
-        let e = u32x4::new(2, 4, 6, 8);
+    unsafe fn test_mm_max_epu32() {
+        let a = _mm_setr_epi32(1, 4, 5, 8);
+        let b = _mm_setr_epi32(2, 3, 6, 7);
+        let r = _mm_max_epu32(a, b);
+        let e = _mm_setr_epi32(2, 4, 6, 8);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_min_epi8_1() {
+    unsafe fn test_mm_min_epi8_1() {
         #[cfg_attr(rustfmt, rustfmt_skip)]
-        let a = i8x16::new(
+        let a = _mm_setr_epi8(
             1, 4, 5, 8, 9, 12, 13, 16,
             17, 20, 21, 24, 25, 28, 29, 32,
         );
         #[cfg_attr(rustfmt, rustfmt_skip)]
-        let b = i8x16::new(
             2, 3, 6, 7, 10, 11, 14, 15,
+        let b = _mm_setr_epi8(
             2, 3, 6, 7, 10, 11, 14, 15,
             18, 19, 22, 23, 26, 27, 30, 31,
         );
-        let r = sse41::_mm_min_epi8(a, b);
+        let r = _mm_min_epi8(a, b);
         #[cfg_attr(rustfmt, rustfmt_skip)]
-        let e = i8x16::new(
+        let e = _mm_setr_epi8(
             1, 3, 5, 7, 9, 11, 13, 15,
             17, 19, 21, 23, 25, 27, 29, 31,
         );
@@ -964,20 +992,20 @@ mod tests {
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_min_epi8_2() {
        #[cfg_attr(rustfmt, rustfmt_skip)]
-        let a = i8x16::new(
+    unsafe fn test_mm_min_epi8_2() {
        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let a = _mm_setr_epi8(
             1, -4, -5, 8, -9, -12, 13, -16,
             17, 20, 21, 24, 25, 28, 29, 32,
         );
         #[cfg_attr(rustfmt, rustfmt_skip)]
-        let b = i8x16::new(
+        let b = _mm_setr_epi8(
             2, -3, -6, 7, -10, -11, 14, -15,
             18, 19, 22, 23, 26, 27, 30, 31,
         );
-        let r = sse41::_mm_min_epi8(a, b);
         #[cfg_attr(rustfmt, rustfmt_skip)]
-        let e = i8x16::new(
+        let r = _mm_min_epi8(a, b);
         #[cfg_attr(rustfmt, rustfmt_skip)]
+        let e = _mm_setr_epi8(
             1, -4, -6, 7, -10, -12, 13, -16,
             17, 19, 21, 23, 25, 27, 29, 31,
         );
@@ -985,401 +1013,399 @@ mod tests {
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_min_epu16() {
-        let a = u16x8::new(1, 4, 5, 8, 9, 12, 13, 16);
-        let b = u16x8::new(2, 3, 6, 7, 10, 11, 14, 15);
-        let r = sse41::_mm_min_epu16(a, b);
-        let e = u16x8::new(1, 3, 5, 7, 9, 11, 13, 15);
+    unsafe fn test_mm_min_epu16() {
+        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
+        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
+        let r = _mm_min_epu16(a, b);
+        let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_min_epi32_1() {
-        let a = i32x4::new(1, 4, 5, 8);
-        let b = i32x4::new(2, 3, 6, 7);
-        let r = sse41::_mm_min_epi32(a, b);
-        let e = i32x4::new(1, 3, 5, 7);
+    unsafe fn test_mm_min_epi32_1() {
+        let a = _mm_setr_epi32(1, 4, 5, 8);
+        let b = _mm_setr_epi32(2, 3, 6, 7);
+        let r = _mm_min_epi32(a, b);
+        let e = _mm_setr_epi32(1, 3, 5, 7);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_min_epi32_2() {
-        let a = i32x4::new(-1, 4, 5, -7);
-        let b = i32x4::new(-2, 3, -6, 8);
-        let r = sse41::_mm_min_epi32(a, b);
-        let e = i32x4::new(-2, 3, -6, -7);
+    unsafe fn test_mm_min_epi32_2() {
+        let a = _mm_setr_epi32(-1, 4, 5, -7);
+        let b = _mm_setr_epi32(-2, 3, -6, 8);
+        let r = _mm_min_epi32(a, b);
+        let e = _mm_setr_epi32(-2, 3, -6, -7);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_min_epu32() {
-        let a = u32x4::new(1, 4, 5, 8);
-        let b = u32x4::new(2, 3, 6, 7);
-        let r = sse41::_mm_min_epu32(a, b);
-        let e = u32x4::new(1, 3, 5, 7);
+    unsafe fn test_mm_min_epu32() {
+        let a = _mm_setr_epi32(1, 4, 5, 8);
+        let b = _mm_setr_epi32(2, 3, 6, 7);
+        let r = _mm_min_epu32(a, b);
+        let e = _mm_setr_epi32(1, 3, 5, 7);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_packus_epi32() {
-        let a = i32x4::new(1, 2, 3, 4);
-        let b = i32x4::new(-1, -2, -3, -4);
-        let r = sse41::_mm_packus_epi32(a, b);
-        let e = u16x8::new(1, 2, 3, 4, 0, 0, 0, 0);
+    unsafe fn test_mm_packus_epi32() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let b = _mm_setr_epi32(-1, -2, -3, -4);
+        let r = _mm_packus_epi32(a, b);
+        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cmpeq_epi64() {
-        let a = i64x2::new(0, 1);
-        let b = i64x2::new(0, 0);
-        let r = sse41::_mm_cmpeq_epi64(a, b);
-        let e = i64x2::new(-1, 0);
+    unsafe fn test_mm_cmpeq_epi64() {
+        let a = _mm_setr_epi64x(0, 1);
+        let b = _mm_setr_epi64x(0, 0);
+        let r = _mm_cmpeq_epi64(a, b);
+        let e = _mm_setr_epi64x(-1, 0);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepi8_epi16() {
-        let a = i8x16::splat(10);
-        let r = sse41::_mm_cvtepi8_epi16(a);
-        let e = i16x8::splat(10);
+    unsafe fn test_mm_cvtepi8_epi16() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepi8_epi16(a);
+        let e = _mm_set1_epi16(10);
         assert_eq!(r, e);
-        let a = i8x16::splat(-10);
-        let r = sse41::_mm_cvtepi8_epi16(a);
-        let e = i16x8::splat(-10);
+        let a = _mm_set1_epi8(-10);
+        let r = _mm_cvtepi8_epi16(a);
+        let e = _mm_set1_epi16(-10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepi8_epi32() {
-        let a = i8x16::splat(10);
-        let r = sse41::_mm_cvtepi8_epi32(a);
-        let e = i32x4::splat(10);
+    unsafe fn test_mm_cvtepi8_epi32() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepi8_epi32(a);
+        let e = _mm_set1_epi32(10);
         assert_eq!(r, e);
-        let a = i8x16::splat(-10);
-        let r = sse41::_mm_cvtepi8_epi32(a);
-        let e = i32x4::splat(-10);
+        let a = _mm_set1_epi8(-10);
+        let r = _mm_cvtepi8_epi32(a);
+        let e = _mm_set1_epi32(-10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepi8_epi64() {
-        let a = i8x16::splat(10);
-        let r = sse41::_mm_cvtepi8_epi64(a);
-        let e = i64x2::splat(10);
+    unsafe fn test_mm_cvtepi8_epi64() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepi8_epi64(a);
+        let e = _mm_set1_epi64x(10);
         assert_eq!(r, e);
-        let a = i8x16::splat(-10);
-        let r = sse41::_mm_cvtepi8_epi64(a);
-        let e = i64x2::splat(-10);
+        let a = _mm_set1_epi8(-10);
+        let r = _mm_cvtepi8_epi64(a);
+        let e = _mm_set1_epi64x(-10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepi16_epi32() {
-        let a = i16x8::splat(10);
-        let r = sse41::_mm_cvtepi16_epi32(a);
-        let e = i32x4::splat(10);
+    unsafe fn test_mm_cvtepi16_epi32() {
+        let a = _mm_set1_epi16(10);
+        let r = _mm_cvtepi16_epi32(a);
+        let e = _mm_set1_epi32(10);
         assert_eq!(r, e);
-        let a = i16x8::splat(-10);
-        let r = sse41::_mm_cvtepi16_epi32(a);
-        let e = i32x4::splat(-10);
+        let a = _mm_set1_epi16(-10);
+        let r = _mm_cvtepi16_epi32(a);
+        let e = _mm_set1_epi32(-10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepi16_epi64() {
-        let a = i16x8::splat(10);
-        let r = sse41::_mm_cvtepi16_epi64(a);
-        let e = i64x2::splat(10);
+    unsafe fn test_mm_cvtepi16_epi64() {
+        let a = _mm_set1_epi16(10);
+        let r = _mm_cvtepi16_epi64(a);
+        let e = _mm_set1_epi64x(10);
         assert_eq!(r, e);
-        let a = i16x8::splat(-10);
-        let r = sse41::_mm_cvtepi16_epi64(a);
-        let e = i64x2::splat(-10);
+        let a = _mm_set1_epi16(-10);
+        let r = _mm_cvtepi16_epi64(a);
+        let e = _mm_set1_epi64x(-10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepi32_epi64() {
-        let a = i32x4::splat(10);
-        let r = sse41::_mm_cvtepi32_epi64(a);
-        let e = i64x2::splat(10);
+    unsafe fn test_mm_cvtepi32_epi64() {
+        let a = _mm_set1_epi32(10);
+        let r = _mm_cvtepi32_epi64(a);
+        let e = _mm_set1_epi64x(10);
         assert_eq!(r, e);
-        let a = i32x4::splat(-10);
-        let r = sse41::_mm_cvtepi32_epi64(a);
-        let e = i64x2::splat(-10);
+        let a = _mm_set1_epi32(-10);
+        let r = _mm_cvtepi32_epi64(a);
+        let e = _mm_set1_epi64x(-10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepu8_epi16() {
-        let a = u8x16::splat(10);
-        let r = sse41::_mm_cvtepu8_epi16(a);
-        let e = i16x8::splat(10);
+    unsafe fn test_mm_cvtepu8_epi16() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepu8_epi16(a);
+        let e = _mm_set1_epi16(10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepu8_epi32() {
-        let a = u8x16::splat(10);
-        let r = sse41::_mm_cvtepu8_epi32(a);
-        let e = i32x4::splat(10);
+    unsafe fn test_mm_cvtepu8_epi32() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepu8_epi32(a);
+        let e = _mm_set1_epi32(10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepu8_epi64() {
-        let a = u8x16::splat(10);
-        let r = sse41::_mm_cvtepu8_epi64(a);
-        let e = i64x2::splat(10);
+    unsafe fn test_mm_cvtepu8_epi64() {
+        let a = _mm_set1_epi8(10);
+        let r = _mm_cvtepu8_epi64(a);
+        let e = _mm_set1_epi64x(10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepu16_epi32() {
-        let a = u16x8::splat(10);
-        let r = sse41::_mm_cvtepu16_epi32(a);
-        let e = i32x4::splat(10);
+    unsafe fn test_mm_cvtepu16_epi32() {
+        let a = _mm_set1_epi16(10);
+        let r = _mm_cvtepu16_epi32(a);
+        let e = _mm_set1_epi32(10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepu16_epi64() {
-        let a = u16x8::splat(10);
-        let r = sse41::_mm_cvtepu16_epi64(a);
-        let e = i64x2::splat(10);
+    unsafe fn test_mm_cvtepu16_epi64() {
+        let a = _mm_set1_epi16(10);
+        let r = _mm_cvtepu16_epi64(a);
+        let e = _mm_set1_epi64x(10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_cvtepu32_epi64() {
-        let a = u32x4::splat(10);
-        let r = sse41::_mm_cvtepu32_epi64(a);
-        let e = i64x2::splat(10);
+    unsafe fn test_mm_cvtepu32_epi64() {
+        let a = _mm_set1_epi32(10);
+        let r = _mm_cvtepu32_epi64(a);
+        let e = _mm_set1_epi64x(10);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_dp_pd() {
-        let a = f64x2::new(2.0, 3.0);
-        let b = f64x2::new(1.0, 4.0);
-        let e = f64x2::new(14.0, 0.0);
-        assert_eq!(sse41::_mm_dp_pd(a, b, 0b00110001), e);
+    unsafe fn test_mm_dp_pd() {
+        let a = _mm_setr_pd(2.0, 3.0);
+        let b = _mm_setr_pd(1.0, 4.0);
+        let e = _mm_setr_pd(14.0, 0.0);
+        assert_eq_m128d(_mm_dp_pd(a, b, 0b00110001), e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_dp_ps() {
-        let a = f32x4::new(2.0, 3.0, 1.0, 10.0);
-        let b = f32x4::new(1.0, 4.0, 0.5, 10.0);
-        let e = f32x4::new(14.5, 0.0, 14.5, 0.0);
-        assert_eq!(sse41::_mm_dp_ps(a, b, 0b01110101), e);
+    unsafe fn test_mm_dp_ps() {
+        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
+        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
+        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
+        assert_eq_m128(_mm_dp_ps(a, b, 0b01110101), e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_floor_pd() {
-        let a = f64x2::new(2.5, 4.5);
-        let r = sse41::_mm_floor_pd(a);
-        let e = f64x2::new(2.0, 4.0);
-        assert_eq!(r, e);
+    unsafe fn test_mm_floor_pd() {
+        let a = _mm_setr_pd(2.5, 4.5);
+        let r = _mm_floor_pd(a);
+        let e = _mm_setr_pd(2.0, 4.0);
+        assert_eq_m128d(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_floor_ps() {
-        let a = f32x4::new(2.5, 4.5, 8.5, 16.5);
-        let r = sse41::_mm_floor_ps(a);
-        let e = f32x4::new(2.0, 4.0, 8.0, 16.0);
-        assert_eq!(r, e);
+    unsafe fn test_mm_floor_ps() {
+        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
+        let r = _mm_floor_ps(a);
+        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
+        assert_eq_m128(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_floor_sd() {
-        let a = f64x2::new(2.5, 4.5);
-        let b = f64x2::new(-1.5, -3.5);
-        let r = sse41::_mm_floor_sd(a, b);
-        let e = f64x2::new(-2.0, 4.5);
-        assert_eq!(r, e);
+    unsafe fn test_mm_floor_sd() {
+        let a = _mm_setr_pd(2.5, 4.5);
+        let b = _mm_setr_pd(-1.5, -3.5);
+        let r = _mm_floor_sd(a, b);
+        let e = _mm_setr_pd(-2.0, 4.5);
+        assert_eq_m128d(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_floor_ss() {
-        let a = f32x4::new(2.5, 4.5, 8.5, 16.5);
-        let b = f32x4::new(-1.5, -3.5, -7.5, -15.5);
-        let r = sse41::_mm_floor_ss(a, b);
-        let e = f32x4::new(-2.0, 4.5, 8.5, 16.5);
-        assert_eq!(r, e);
+    unsafe fn test_mm_floor_ss() {
+        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
+        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
+        let r = _mm_floor_ss(a, b);
+        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
+        assert_eq_m128(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_ceil_pd() {
-        let a = f64x2::new(1.5, 3.5);
-        let r = sse41::_mm_ceil_pd(a);
-        let e = f64x2::new(2.0, 4.0);
-        assert_eq!(r, e);
+    unsafe fn test_mm_ceil_pd() {
+        let a = _mm_setr_pd(1.5, 3.5);
+        let r = _mm_ceil_pd(a);
+        let e = _mm_setr_pd(2.0, 4.0);
+        assert_eq_m128d(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_ceil_ps() {
-        let a = f32x4::new(1.5, 3.5, 7.5, 15.5);
-        let r = sse41::_mm_ceil_ps(a);
-        let e = f32x4::new(2.0, 4.0, 8.0, 16.0);
-        assert_eq!(r, e);
+    unsafe fn test_mm_ceil_ps() {
+        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
+        let r = _mm_ceil_ps(a);
+        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
+        assert_eq_m128(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_ceil_sd() {
-        let a = f64x2::new(1.5, 3.5);
-        let b = f64x2::new(-2.5, -4.5);
-        let r = sse41::_mm_ceil_sd(a, b);
-        let e = f64x2::new(-2.0, 3.5);
-        assert_eq!(r, e);
+    unsafe fn test_mm_ceil_sd() {
+        let a = _mm_setr_pd(1.5, 3.5);
+        let b = _mm_setr_pd(-2.5, -4.5);
+        let r = _mm_ceil_sd(a, b);
+        let e = _mm_setr_pd(-2.0, 3.5);
+        assert_eq_m128d(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_ceil_ss() {
-        let a = f32x4::new(1.5, 3.5, 7.5, 15.5);
-        let b = f32x4::new(-2.5, -4.5, -8.5, -16.5);
-        let r = sse41::_mm_ceil_ss(a, b);
-        let e = f32x4::new(-2.0, 3.5, 7.5, 15.5);
-        assert_eq!(r, e);
+    unsafe fn test_mm_ceil_ss() {
+        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
+        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
+        let r = _mm_ceil_ss(a, b);
+        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
+        assert_eq_m128(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_round_pd() {
-        let a = f64x2::new(1.25, 3.75);
-        let r = sse41::_mm_round_pd(a, sse41::_MM_FROUND_TO_NEAREST_INT);
-        let e = f64x2::new(1.0, 4.0);
-        assert_eq!(r, e);
+    unsafe fn test_mm_round_pd() {
+        let a = _mm_setr_pd(1.25, 3.75);
+        let r = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
+        let e = _mm_setr_pd(1.0, 4.0);
+        assert_eq_m128d(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_round_ps() {
-        let a = f32x4::new(2.25, 4.75, -1.75, -4.25);
-        let r = sse41::_mm_round_ps(a, sse41::_MM_FROUND_TO_ZERO);
-        let e = f32x4::new(2.0, 4.0, -1.0, -4.0);
-        assert_eq!(r, e);
+    unsafe fn test_mm_round_ps() {
+        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
+        let r = _mm_round_ps(a, _MM_FROUND_TO_ZERO);
+        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
+        assert_eq_m128(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_round_sd() {
-        use x86::i586::sse;
-        let a = f64x2::new(1.5, 3.5);
-        let b = f64x2::new(-2.5, -4.5);
-        let old_mode = sse::_MM_GET_ROUNDING_MODE();
-        sse::_MM_SET_ROUNDING_MODE(sse::_MM_ROUND_TOWARD_ZERO);
-        let r = sse41::_mm_round_sd(a, b, sse41::_MM_FROUND_CUR_DIRECTION);
-        sse::_MM_SET_ROUNDING_MODE(old_mode);
-        let e = f64x2::new(-2.0, 3.5);
-        assert_eq!(r, e);
+    unsafe fn test_mm_round_sd() {
+        let a = _mm_setr_pd(1.5, 3.5);
+        let b = _mm_setr_pd(-2.5, -4.5);
+        let old_mode = _MM_GET_ROUNDING_MODE();
+        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
+        let r = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
+        _MM_SET_ROUNDING_MODE(old_mode);
+        let e = _mm_setr_pd(-2.0, 3.5);
+        assert_eq_m128d(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_round_ss() {
-        use x86::i586::sse;
-        let a = f32x4::new(1.5, 3.5, 7.5, 15.5);
-        let b = f32x4::new(-1.75, -4.5, -8.5, -16.5);
-        let old_mode = sse::_MM_GET_ROUNDING_MODE();
-        sse::_MM_SET_ROUNDING_MODE(sse::_MM_ROUND_NEAREST);
-        let r = sse41::_mm_round_ss(a, b, sse41::_MM_FROUND_CUR_DIRECTION);
-        sse::_MM_SET_ROUNDING_MODE(old_mode);
-        let e = f32x4::new(-2.0, 3.5, 7.5, 15.5);
-        assert_eq!(r, e);
+    unsafe fn test_mm_round_ss() {
+        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
+        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
+        let old_mode = _MM_GET_ROUNDING_MODE();
+        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+        let r = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
+        _MM_SET_ROUNDING_MODE(old_mode);
+        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
+        assert_eq_m128(r, e);
    }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_minpos_epu16_1() {
-        let a = u16x8::new(23, 18, 44, 97, 50, 13, 67, 66);
-        let r = sse41::_mm_minpos_epu16(a);
-        let e = u16x8::new(13, 5, 0, 0, 0, 0, 0, 0);
+    unsafe fn test_mm_minpos_epu16_1() {
+        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
+        let r = _mm_minpos_epu16(a);
+        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_minpos_epu16_2() {
-        let a = u16x8::new(0, 18, 44, 97, 50, 13, 67, 66);
-        let r = sse41::_mm_minpos_epu16(a);
-        let e = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
+    unsafe fn test_mm_minpos_epu16_2() {
+        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
+        let r = _mm_minpos_epu16(a);
+        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_mul_epi32() {
+    unsafe fn test_mm_mul_epi32() {
         {
-            let a = i32x4::new(1, 1, 1, 1);
-            let b = i32x4::new(1, 2, 3, 4);
-            let r = sse41::_mm_mul_epi32(a, b);
-            let e = i64x2::new(1, 3);
+            let a = _mm_setr_epi32(1, 1, 1, 1);
+            let b = _mm_setr_epi32(1, 2, 3, 4);
+            let r = _mm_mul_epi32(a, b);
+            let e = _mm_setr_epi64x(1, 3);
             assert_eq!(r, e);
         }
         {
-            let a = i32x4::new(
+            let a = _mm_setr_epi32(
                 15,
                 2, /* ignored */
                 1234567,
                 4, /* ignored */
             );
-            let b = i32x4::new(
+            let b = _mm_setr_epi32(
                 -20,
                 -256, /* ignored */
                 666666,
                 666666, /* ignored */
             );
-            let r = sse41::_mm_mul_epi32(a, b);
-            let e = i64x2::new(-300, 823043843622);
+            let r = _mm_mul_epi32(a, b);
+            let e = _mm_setr_epi64x(-300, 823043843622);
             assert_eq!(r, e);
         }
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_mullo_epi32() {
+    unsafe fn test_mm_mullo_epi32() {
         {
-            let a = i32x4::new(1, 1, 1, 1);
-            let b = i32x4::new(1, 2, 3, 4);
-            let r = sse41::_mm_mullo_epi32(a, b);
-            let e = i32x4::new(1, 2, 3, 4);
+            let a = _mm_setr_epi32(1, 1, 1, 1);
+            let b = _mm_setr_epi32(1, 2, 3, 4);
+            let r = _mm_mullo_epi32(a, b);
+            let e = _mm_setr_epi32(1, 2, 3, 4);
             assert_eq!(r, e);
         }
         {
-            let a = i32x4::new(15, -2, 1234567, 99999);
-            let b = i32x4::new(-20, -256, 666666, -99999);
-            let r = sse41::_mm_mullo_epi32(a, b);
+            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
+            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
+            let r = _mm_mullo_epi32(a, b);
             // Attention, most significant bit in r[2] is treated
             // as a sign bit:
             // 1234567 * 666666 = -1589877210
-            let e = i32x4::new(-300, 512, -1589877210, -1409865409);
+            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
             assert_eq!(r, e);
         }
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_minpos_epu16() {
-        let a = u16x8::new(8, 7, 6, 5, 4, 1, 2, 3);
-        let r = sse41::_mm_minpos_epu16(a);
-        let e = u16x8::splat(0).replace(0, 1).replace(1, 5);
+    unsafe fn test_mm_minpos_epu16() {
+        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
+        let r = _mm_minpos_epu16(a);
+        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
-    unsafe fn _mm_mpsadbw_epu8() {
+    unsafe fn test_mm_mpsadbw_epu8() {
         let a =
-            u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+            _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 
-        let r = sse41::_mm_mpsadbw_epu8(a, a, 0b000);
-        let e = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+        let r = _mm_mpsadbw_epu8(a, a, 0b000);
+        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
         assert_eq!(r, e);
 
-        let r = sse41::_mm_mpsadbw_epu8(a, a, 0b001);
-        let e = u16x8::new(16, 12, 8, 4, 0, 4, 8, 12);
+        let r = _mm_mpsadbw_epu8(a, a, 0b001);
+        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
         assert_eq!(r, e);
 
-        let r = sse41::_mm_mpsadbw_epu8(a, a, 0b100);
-        let e = u16x8::new(16, 20, 24, 28, 32, 36, 40, 44);
+        let r = _mm_mpsadbw_epu8(a, a, 0b100);
+        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
         assert_eq!(r, e);
 
-        let r = sse41::_mm_mpsadbw_epu8(a, a, 0b101);
-        let e = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+        let r = _mm_mpsadbw_epu8(a, a, 0b101);
+        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
         assert_eq!(r, e);
 
-        let r = sse41::_mm_mpsadbw_epu8(a, a, 0b111);
-        let e = u16x8::new(32, 28, 24, 20, 16, 12, 8, 4);
+        let r = _mm_mpsadbw_epu8(a, a, 0b111);
+        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
         assert_eq!(r, e);
     }
 }
diff --git a/coresimd/src/x86/test.rs b/coresimd/src/x86/test.rs
index a4a6e358b5297..f5614861b96ad 100644
--- a/coresimd/src/x86/test.rs
+++ b/coresimd/src/x86/test.rs
@@ -30,3 +30,10 @@ pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 {
     union A { a: __m128, b: [f32; 4] };
     mem::transmute::<__m128, A>(a).b[idx]
 }
+
+// not actually an intrinsic but useful in various tests as we ported from
+// `i64x2::new`, which is backwards from `_mm_set_epi64x`
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
+    _mm_set_epi64x(b, a)
+}
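
Note on the `_mm_setr_epi64x` helper added to test.rs above: `_mm_set_epi64x`
takes its arguments high lane first, so the helper simply swaps them to keep
the low-lane-first order the old `i64x2::new`-based tests were written
against. A minimal stand-alone sketch of the same pitfall, written against
today's stable `std::arch` rather than this crate's internals (the function
name and the `lanes` transmute are illustrative only):

    #[cfg(target_arch = "x86_64")]
    unsafe fn lane_order_demo() {
        use std::arch::x86_64::*;
        // `_mm_set_epi64x(e1, e0)` takes the HIGH lane first...
        let set = _mm_set_epi64x(1, 0);
        // ...so a "setr" (reversed, low-lane-first) helper just swaps its
        // arguments, which is all the `_mm_setr_epi64x(a, b)` shim does.
        let setr = _mm_set_epi64x(0, 1); // what `_mm_setr_epi64x(1, 0)` yields
        let set_lanes: [i64; 2] = std::mem::transmute(set);
        let setr_lanes: [i64; 2] = std::mem::transmute(setr);
        assert_eq!(set_lanes, [0, 1]); // lane 0 = 0, lane 1 = 1
        assert_eq!(setr_lanes, [1, 0]);
    }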
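The float tests switch from `assert_eq!` to `assert_eq_m128`/`assert_eq_m128d`
because the vendor types deliberately drop `PartialEq` and the other lane-wise
trait impls the portable `f32x4`/`f64x2` types had. This patch assumes those
helpers exist alongside `get_m128` in coresimd/src/x86/test.rs; a plausible
shape for the single-precision one, sketched with SSE compare/movemask
intrinsics rather than copied from the crate:

    #[cfg(target_arch = "x86_64")]
    pub unsafe fn assert_eq_m128(
        a: std::arch::x86_64::__m128,
        b: std::arch::x86_64::__m128,
    ) {
        use std::arch::x86_64::*;
        // CMPEQPS sets a lane to all-ones where equal; MOVMSKPS packs the
        // four sign bits, so 0b1111 means every lane compared equal.
        let r = _mm_movemask_ps(_mm_cmpeq_ps(a, b));
        assert_eq!(r, 0b1111, "lane mismatch, mask = {:04b}", r);
    }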
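More generally, the pattern this migration establishes is visible in every
rewritten intrinsic and test: callers build values with `_mm_set*`/`_mm_setr*`,
pass them around as opaque `__m128i`, and the intrinsic reinterprets to a
lane-typed view (`as_i8x16()` etc.) internally before transmuting back. A
small end-to-end sketch of `_mm_blendv_epi8` used in that style, written
against current stable `std::arch` (function names here are illustrative):

    #[cfg(target_arch = "x86_64")]
    fn blend_demo() {
        #[target_feature(enable = "sse4.1")]
        unsafe fn run() -> [i8; 16] {
            use std::arch::x86_64::*;
            let a = _mm_set1_epi8(1);
            let b = _mm_set1_epi8(2);
            // lane i of the result comes from `b` where the mask byte's
            // high bit is set, from `a` otherwise
            let mask = _mm_setr_epi8(
                0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
            );
            std::mem::transmute(_mm_blendv_epi8(a, b, mask))
        }
        if is_x86_feature_detected!("sse4.1") {
            let r = unsafe { run() };
            assert_eq!(&r[..4], &[1, 2, 1, 2]);
        }
    }

Checking the CPU feature at runtime (or compiling with the feature enabled)
is what makes calling the `#[target_feature]` function sound.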