diff --git a/build.rs b/build.rs index f658c3a6dac1..95c4b03b6351 100644 --- a/build.rs +++ b/build.rs @@ -202,8 +202,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { // to be a big chunk of work to implement them all there! ("simd", _) if target.contains("aarch64") => return true, - ("simd", "simd_conversions") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S - // TODO(#1886): Ignore reference types tests if this isn't x64, // because Cranelift only supports reference types on x64. ("reference_types", _) => { diff --git a/cranelift/codegen/meta/src/cdsl/typevar.rs b/cranelift/codegen/meta/src/cdsl/typevar.rs index 0c0b2e96471e..752b458b2a75 100644 --- a/cranelift/codegen/meta/src/cdsl/typevar.rs +++ b/cranelift/codegen/meta/src/cdsl/typevar.rs @@ -211,6 +211,24 @@ impl TypeVar { "can't double 256 lanes" ); } + DerivedFunc::MergeLanes => { + assert!( + ts.ints.is_empty() || *ts.ints.iter().max().unwrap() < MAX_BITS, + "can't double all integer types" + ); + assert!( + ts.floats.is_empty() || *ts.floats.iter().max().unwrap() < MAX_FLOAT_BITS, + "can't double all float types" + ); + assert!( + ts.bools.is_empty() || *ts.bools.iter().max().unwrap() < MAX_BITS, + "can't double all boolean types" + ); + assert!( + *ts.lanes.iter().min().unwrap() > 1, + "can't halve a scalar type" + ); + } DerivedFunc::LaneOf | DerivedFunc::AsBool => { /* no particular assertions */ } } @@ -248,6 +266,9 @@ impl TypeVar { pub fn split_lanes(&self) -> TypeVar { self.derived(DerivedFunc::SplitLanes) } + pub fn merge_lanes(&self) -> TypeVar { + self.derived(DerivedFunc::MergeLanes) + } /// Constrain the range of types this variable can assume to a subset of those in the typeset /// ts. 
@@ -355,6 +376,7 @@ pub(crate) enum DerivedFunc { HalfVector, DoubleVector, SplitLanes, + MergeLanes, } impl DerivedFunc { @@ -367,6 +389,7 @@ impl DerivedFunc { DerivedFunc::HalfVector => "half_vector", DerivedFunc::DoubleVector => "double_vector", DerivedFunc::SplitLanes => "split_lanes", + DerivedFunc::MergeLanes => "merge_lanes", } } @@ -377,6 +400,8 @@ impl DerivedFunc { DerivedFunc::DoubleWidth => Some(DerivedFunc::HalfWidth), DerivedFunc::HalfVector => Some(DerivedFunc::DoubleVector), DerivedFunc::DoubleVector => Some(DerivedFunc::HalfVector), + DerivedFunc::MergeLanes => Some(DerivedFunc::SplitLanes), + DerivedFunc::SplitLanes => Some(DerivedFunc::MergeLanes), _ => None, } } @@ -462,6 +487,7 @@ impl TypeSet { DerivedFunc::HalfVector => self.half_vector(), DerivedFunc::DoubleVector => self.double_vector(), DerivedFunc::SplitLanes => self.half_width().double_vector(), + DerivedFunc::MergeLanes => self.double_width().half_vector(), } } @@ -601,7 +627,8 @@ impl TypeSet { DerivedFunc::DoubleWidth => self.half_width(), DerivedFunc::HalfVector => self.double_vector(), DerivedFunc::DoubleVector => self.half_vector(), - DerivedFunc::SplitLanes => self.half_vector().double_width(), + DerivedFunc::SplitLanes => self.double_width().half_vector(), + DerivedFunc::MergeLanes => self.half_width().double_vector(), } } diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index 8f4a77d814a4..da04019a1b26 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1669,6 +1669,7 @@ fn define_simd( let ssub_sat = shared.by_name("ssub_sat"); let store = shared.by_name("store"); let store_complex = shared.by_name("store_complex"); + let swiden_low = shared.by_name("swiden_low"); let uadd_sat = shared.by_name("uadd_sat"); let uload8x8 = shared.by_name("uload8x8"); let uload8x8_complex = shared.by_name("uload8x8_complex"); @@ -1678,6 +1679,7 @@ fn define_simd( let 
uload32x2_complex = shared.by_name("uload32x2_complex"); let snarrow = shared.by_name("snarrow"); let unarrow = shared.by_name("unarrow"); + let uwiden_low = shared.by_name("uwiden_low"); let ushr_imm = shared.by_name("ushr_imm"); let usub_sat = shared.by_name("usub_sat"); let vconst = shared.by_name("vconst"); @@ -1697,6 +1699,7 @@ fn define_simd( let x86_pminu = x86.by_name("x86_pminu"); let x86_pmullq = x86.by_name("x86_pmullq"); let x86_pmuludq = x86.by_name("x86_pmuludq"); + let x86_palignr = x86.by_name("x86_palignr"); let x86_pshufb = x86.by_name("x86_pshufb"); let x86_pshufd = x86.by_name("x86_pshufd"); let x86_psll = x86.by_name("x86_psll"); @@ -1901,6 +1904,8 @@ fn define_simd( rec_fa.opcodes(low), ); } + + // SIMD narrow/widen for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] { let snarrow = snarrow.bind(vector(*ty, sse_vector_size)); e.enc_both_inferred(snarrow, rec_fa.opcodes(*opcodes)); @@ -1912,6 +1917,23 @@ fn define_simd( let unarrow = unarrow.bind(vector(*ty, sse_vector_size)); e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap); } + for (ty, swiden_opcode, uwiden_opcode) in &[ + (I8, &PMOVSXBW[..], &PMOVZXBW[..]), + (I16, &PMOVSXWD[..], &PMOVZXWD[..]), + ] { + let isap = Some(use_sse41_simd); + let swiden_low = swiden_low.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred_maybe_isap(swiden_low, rec_furm.opcodes(*swiden_opcode), isap); + let uwiden_low = uwiden_low.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred_maybe_isap(uwiden_low, rec_furm.opcodes(*uwiden_opcode), isap); + } + for ty in &[I8, I16, I32, I64] { + e.enc_both_inferred_maybe_isap( + x86_palignr.bind(vector(*ty, sse_vector_size)), + rec_fa_ib.opcodes(&PALIGNR[..]), + Some(use_ssse3_simd), + ); + } // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8). 
for from_type in ValueType::all_lane_types().filter(allowed_simd_type) { diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs index 0e48784f2337..7acd2e2c5088 100644 --- a/cranelift/codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs @@ -664,6 +664,21 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let c = &Operand::new("c", uimm8) + .with_doc("The number of bytes to shift right; see PALIGNR in Intel manual for details"); + ig.push( + Inst::new( + "x86_palignr", + r#" + Concatenate destination and source operands, extracting a byte-aligned result shifted to + the right by `c`. + "#, + &formats.ternary_imm8, + ) + .operands_in(vec![x, y, c]) + .operands_out(vec![a]), + ); + let i64_t = &TypeVar::new( "i64_t", "A scalar 64bit integer", diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs index 20f87ac265c5..de78c3b3b7f9 100644 --- a/cranelift/codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs @@ -407,13 +407,18 @@ fn define_simd( let umax = insts.by_name("umax"); let umin = insts.by_name("umin"); let snarrow = insts.by_name("snarrow"); + let swiden_high = insts.by_name("swiden_high"); + let swiden_low = insts.by_name("swiden_low"); let ushr_imm = insts.by_name("ushr_imm"); let ushr = insts.by_name("ushr"); + let uwiden_high = insts.by_name("uwiden_high"); + let uwiden_low = insts.by_name("uwiden_low"); let vconst = insts.by_name("vconst"); let vall_true = insts.by_name("vall_true"); let vany_true = insts.by_name("vany_true"); let vselect = insts.by_name("vselect"); + let x86_palignr = x86_instructions.by_name("x86_palignr"); let x86_pmaxs = x86_instructions.by_name("x86_pmaxs"); let x86_pmaxu = x86_instructions.by_name("x86_pmaxu"); let x86_pmins = x86_instructions.by_name("x86_pmins"); @@ -786,6 +791,26 @@ fn define_simd( ); } + // SIMD widen + for ty in 
&[I8, I16] { + let swiden_high = swiden_high.bind(vector(*ty, sse_vector_size)); + narrow.legalize( + def!(b = swiden_high(a)), + vec![ + def!(c = x86_palignr(a, a, uimm8_eight)), + def!(b = swiden_low(c)), + ], + ); + let uwiden_high = uwiden_high.bind(vector(*ty, sse_vector_size)); + narrow.legalize( + def!(b = uwiden_high(a)), + vec![ + def!(c = x86_palignr(a, a, uimm8_eight)), + def!(b = uwiden_low(c)), + ], + ); + } + narrow.custom_legalize(shuffle, "convert_shuffle"); narrow.custom_legalize(extractlane, "convert_extractlane"); narrow.custom_legalize(insertlane, "convert_insertlane"); diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index c357488ddd4e..09c07c458fa2 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -354,6 +354,10 @@ pub static PADDUSB: [u8; 3] = [0x66, 0x0f, 0xdc]; /// Add packed unsigned word integers from xmm2/m128 and xmm1 saturate the results (SSE). pub static PADDUSW: [u8; 3] = [0x66, 0x0f, 0xdd]; +/// Concatenate destination and source operands, extract a byte-aligned result into xmm1 that is +/// shifted to the right by the constant number of bytes in imm8 (SSSE3). +pub static PALIGNR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0f]; + /// Bitwise AND of xmm2/m128 and xmm1 (SSE2). pub static PAND: [u8; 3] = [0x66, 0x0f, 0xdb]; @@ -473,7 +477,7 @@ pub static PMOVSXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x20]; pub static PMOVSXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x23]; /// Sign extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit -/// integers in xmm1. +/// integers in xmm1 (SSE4.1). 
pub static PMOVSXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x25]; /// Zero extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit @@ -485,7 +489,7 @@ pub static PMOVZXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x30]; pub static PMOVZXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x33]; /// Zero extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit -/// integers in xmm1. +/// integers in xmm1 (SSE4.1). pub static PMOVZXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x35]; /// Multiply the packed signed word integers in xmm1 and xmm2/m128, and store the low 16 bits of diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index c78787ce823f..1c06c4a32576 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -3883,9 +3883,9 @@ pub(crate) fn define( .constraints(vec![WiderOrEq(Int.clone(), IntTo.clone())]), ); - let I16xN = &TypeVar::new( - "I16xN", - "A SIMD vector type containing integers 16-bits wide and up", + let I16or32xN = &TypeVar::new( + "I16or32xN", + "A SIMD vector type containing integer lanes 16 or 32 bits wide", TypeSetBuilder::new() .ints(16..32) .simd_lanes(4..8) @@ -3893,9 +3893,9 @@ pub(crate) fn define( .build(), ); - let x = &Operand::new("x", I16xN); - let y = &Operand::new("y", I16xN); - let a = &Operand::new("a", &I16xN.split_lanes()); + let x = &Operand::new("x", I16or32xN); + let y = &Operand::new("y", I16or32xN); + let a = &Operand::new("a", &I16or32xN.split_lanes()); ig.push( Inst::new( @@ -3934,6 +3934,75 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let I8or16xN = &TypeVar::new( + "I8or16xN", + "A SIMD vector type containing integer lanes 8 or 16 bits wide.", + TypeSetBuilder::new() + .ints(8..16) + .simd_lanes(8..16) + .includes_scalars(false) + .build(), + ); + + let x = &Operand::new("x", I8or16xN); + let a = &Operand::new("a", &I8or16xN.merge_lanes()); + + ig.push( + Inst::new( + "swiden_low", + 
r#" + Widen the low lanes of `x` using signed extension. + + This will double the lane width and halve the number of lanes. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "swiden_high", + r#" + Widen the high lanes of `x` using signed extension. + + This will double the lane width and halve the number of lanes. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "uwiden_low", + r#" + Widen the low lanes of `x` using unsigned extension. + + This will double the lane width and halve the number of lanes. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "uwiden_high", + r#" + Widen the high lanes of `x` using unsigned extension. + + This will double the lane width and halve the number of lanes. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + let IntTo = &TypeVar::new( "IntTo", "A larger integer type with the same number of lanes", diff --git a/cranelift/codegen/src/ir/instructions.rs b/cranelift/codegen/src/ir/instructions.rs index f835bd5f4a46..2ba730b687e7 100644 --- a/cranelift/codegen/src/ir/instructions.rs +++ b/cranelift/codegen/src/ir/instructions.rs @@ -584,6 +584,9 @@ enum OperandConstraint { /// This operand is `ctrlType.split_lanes()`. SplitLanes, + + /// This operand is `ctrlType.merge_lanes()`. 
+ MergeLanes, } impl OperandConstraint { @@ -615,6 +618,11 @@ impl OperandConstraint { .split_lanes() .expect("invalid type for split_lanes"), ), + MergeLanes => Bound( + ctrl_type + .merge_lanes() + .expect("invalid type for merge_lanes"), + ), } } } diff --git a/cranelift/codegen/src/ir/types.rs b/cranelift/codegen/src/ir/types.rs index 319f3ae66f69..c669839da5a3 100644 --- a/cranelift/codegen/src/ir/types.rs +++ b/cranelift/codegen/src/ir/types.rs @@ -284,7 +284,7 @@ impl Type { /// Split the lane width in half and double the number of lanes to maintain the same bit-width. /// - /// If this is a scalar type of n bits, it produces a SIMD vector type of (n/2)x2. + /// If this is a scalar type of `n` bits, it produces a SIMD vector type of `(n/2)x2`. pub fn split_lanes(self) -> Option { match self.half_width() { Some(half_width) => half_width.by(2), @@ -292,6 +292,17 @@ impl Type { } } + /// Merge lanes to halve the number of lanes and double the lane width to maintain the same + /// bit-width. + /// + /// If this is a scalar type, it will return `None`. + pub fn merge_lanes(self) -> Option { + match self.double_width() { + Some(double_width) => double_width.half_vector(), + None => None, + } + } + /// Index of this type, for use with hash tables etc. 
pub fn index(self) -> usize { usize::from(self.0) diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 80b4518f9ff0..88751a1478dc 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -2133,6 +2133,7 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::X86Insertps | Opcode::X86Movsd | Opcode::X86Movlhps + | Opcode::X86Palignr | Opcode::X86Psll | Opcode::X86Psrl | Opcode::X86Psra @@ -2153,7 +2154,12 @@ pub(crate) fn lower_insn_to_regs>( Opcode::AvgRound => unimplemented!(), Opcode::Iabs => unimplemented!(), - Opcode::Snarrow | Opcode::Unarrow => unimplemented!(), + Opcode::Snarrow + | Opcode::Unarrow + | Opcode::SwidenLow + | Opcode::SwidenHigh + | Opcode::UwidenLow + | Opcode::UwidenHigh => unimplemented!(), Opcode::TlsValue => unimplemented!(), } diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif index ae1cdda753fc..72e3412279dd 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif @@ -1,5 +1,6 @@ test binemit -target x86_64 +set enable_simd +target x86_64 nehalem ; Ensure raw_bitcast emits no instructions. 
function %raw_bitcast_i16x8_to_b32x4() { @@ -10,8 +11,16 @@ block0: return } -function %fcvt_32(i32x4) { -block0(v0: i32x4 [%xmm6]): -[-, %xmm2] v1 = fcvt_from_sint.f32x4 v0 ; bin: 40 0f 5b d6 +function %conversions_i32x4(i32x4, i32x4) { +block0(v0: i32x4 [%xmm6], v1: i32x4 [%xmm4]): +[-, %xmm2] v2 = fcvt_from_sint.f32x4 v0 ; bin: 40 0f 5b d6 +[-, %xmm6] v3 = x86_palignr v0, v1, 3 ; bin: 66 0f 3a 0f f4 03 + return +} + +function %conversions_i16x8(i16x8) { +block0(v0: i16x8 [%xmm6]): +[-, %xmm2] v1 = swiden_low v0 ; bin: 66 0f 38 23 d6 +[-, %xmm11] v2 = uwiden_low v0 ; bin: 66 44 0f 38 33 de return } diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif index ccea16de2c20..011510781009 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif @@ -52,3 +52,19 @@ block0(v0:f32x4): ; nextln: v1 = iadd v12, v11 return v1 } + +function %uwiden_high(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_high v0 + ; check: v2 = x86_palignr v0, v0, 8 + ; nextln: v1 = uwiden_low v2 + return v1 +} + +function %swiden_high(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_high v0 + ; check: v2 = x86_palignr v0, v0, 8 + ; nextln: v1 = swiden_low v2 + return v1 +} diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 64556bdddb79..79eae5c2a6b9 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1582,17 +1582,39 @@ pub fn translate_operator( let (a, b) = pop2_with_bitcast(state, I32X4, builder); state.push1(builder.ins().unarrow(a, b)) } - Operator::I16x8WidenLowI8x16S { .. } - | Operator::I16x8WidenHighI8x16S { .. } - | Operator::I16x8WidenLowI8x16U { .. } - | Operator::I16x8WidenHighI8x16U { .. } - | Operator::I32x4WidenLowI16x8S { .. } - | Operator::I32x4WidenHighI16x8S { .. 
} - | Operator::I32x4WidenLowI16x8U { .. } - | Operator::I32x4WidenHighI16x8U { .. } - | Operator::I8x16Bitmask - | Operator::I16x8Bitmask - | Operator::I32x4Bitmask => { + Operator::I16x8WidenLowI8x16S => { + let a = pop1_with_bitcast(state, I8X16, builder); + state.push1(builder.ins().swiden_low(a)) + } + Operator::I16x8WidenHighI8x16S => { + let a = pop1_with_bitcast(state, I8X16, builder); + state.push1(builder.ins().swiden_high(a)) + } + Operator::I16x8WidenLowI8x16U => { + let a = pop1_with_bitcast(state, I8X16, builder); + state.push1(builder.ins().uwiden_low(a)) + } + Operator::I16x8WidenHighI8x16U => { + let a = pop1_with_bitcast(state, I8X16, builder); + state.push1(builder.ins().uwiden_high(a)) + } + Operator::I32x4WidenLowI16x8S => { + let a = pop1_with_bitcast(state, I16X8, builder); + state.push1(builder.ins().swiden_low(a)) + } + Operator::I32x4WidenHighI16x8S => { + let a = pop1_with_bitcast(state, I16X8, builder); + state.push1(builder.ins().swiden_high(a)) + } + Operator::I32x4WidenLowI16x8U => { + let a = pop1_with_bitcast(state, I16X8, builder); + state.push1(builder.ins().uwiden_low(a)) + } + Operator::I32x4WidenHighI16x8U => { + let a = pop1_with_bitcast(state, I16X8, builder); + state.push1(builder.ins().uwiden_high(a)) + } + Operator::I8x16Bitmask | Operator::I16x8Bitmask | Operator::I32x4Bitmask => { return Err(wasm_unsupported!("proposed SIMD operator {:?}", op)); }