Skip to content

Commit 6a53ace

Browse files
committed Nov 5, 2023
Implement _mm256_permute2f128_ps and _mm256_permute2f128_pd intrinsics
1 parent f6a8c3a commit 6a53ace

File tree

2 files changed

+75
-19
lines changed

2 files changed

+75
-19
lines changed
 

‎src/intrinsics/llvm_x86.rs

+19-19
Original file line numberDiff line numberDiff line change
@@ -172,8 +172,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
172172
}
173173
}
174174
}
175-
"llvm.x86.avx2.vperm2i128" => {
175+
"llvm.x86.avx2.vperm2i128"
176+
| "llvm.x86.avx.vperm2f128.ps.256"
177+
| "llvm.x86.avx.vperm2f128.pd.256" => {
176178
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256
179+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps
180+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd
177181
let (a, b, imm8) = match args {
178182
[a, b, imm8] => (a, b, imm8),
179183
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
@@ -182,19 +186,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
182186
let b = codegen_operand(fx, b);
183187
let imm8 = codegen_operand(fx, imm8).load_scalar(fx);
184188

185-
let a_0 = a.value_lane(fx, 0).load_scalar(fx);
186-
let a_1 = a.value_lane(fx, 1).load_scalar(fx);
187-
let a_low = fx.bcx.ins().iconcat(a_0, a_1);
188-
let a_2 = a.value_lane(fx, 2).load_scalar(fx);
189-
let a_3 = a.value_lane(fx, 3).load_scalar(fx);
190-
let a_high = fx.bcx.ins().iconcat(a_2, a_3);
189+
let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
190+
let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);
191191

192-
let b_0 = b.value_lane(fx, 0).load_scalar(fx);
193-
let b_1 = b.value_lane(fx, 1).load_scalar(fx);
194-
let b_low = fx.bcx.ins().iconcat(b_0, b_1);
195-
let b_2 = b.value_lane(fx, 2).load_scalar(fx);
196-
let b_3 = b.value_lane(fx, 3).load_scalar(fx);
197-
let b_high = fx.bcx.ins().iconcat(b_2, b_3);
192+
let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
193+
let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);
198194

199195
fn select4(
200196
fx: &mut FunctionCx<'_, '_, '_>,
@@ -219,16 +215,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
219215

220216
let control0 = imm8;
221217
let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
222-
let (res_0, res_1) = fx.bcx.ins().isplit(res_low);
223218

224219
let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
225220
let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
226-
let (res_2, res_3) = fx.bcx.ins().isplit(res_high);
227221

228-
ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
229-
ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
230-
ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
231-
ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
222+
ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store(
223+
fx,
224+
res_low,
225+
MemFlags::trusted(),
226+
);
227+
ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store(
228+
fx,
229+
res_high,
230+
MemFlags::trusted(),
231+
);
232232
}
233233
"llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {
234234
let a = match args {

‎src/value_and_place.rs

+56
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,34 @@ impl<'tcx> CValue<'tcx> {
243243
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
244244
let lane_layout = fx.layout_of(lane_ty);
245245
assert!(lane_idx < lane_count);
246+
247+
match self.0 {
248+
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
249+
CValueInner::ByRef(ptr, None) => {
250+
let field_offset = lane_layout.size * lane_idx;
251+
let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
252+
CValue::by_ref(field_ptr, lane_layout)
253+
}
254+
CValueInner::ByRef(_, Some(_)) => unreachable!(),
255+
}
256+
}
257+
258+
/// Like [`CValue::value_lane`] except using the passed type as lane type instead of the one
259+
/// specified by the vector type.
260+
pub(crate) fn value_typed_lane(
261+
self,
262+
fx: &mut FunctionCx<'_, '_, 'tcx>,
263+
lane_ty: Ty<'tcx>,
264+
lane_idx: u64,
265+
) -> CValue<'tcx> {
266+
let layout = self.1;
267+
assert!(layout.ty.is_simd());
268+
let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
269+
let lane_layout = fx.layout_of(lane_ty);
270+
assert!(
271+
(lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
272+
);
273+
246274
match self.0 {
247275
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
248276
CValueInner::ByRef(ptr, None) => {
@@ -734,6 +762,34 @@ impl<'tcx> CPlace<'tcx> {
734762
}
735763
}
736764

765+
/// Like [`CPlace::place_lane`] except using the passed type as lane type instead of the one
766+
/// specified by the vector type.
///
/// The returned place refers to lane `lane_idx` when the vector's storage is viewed as
/// consecutive values of `lane_ty`; the lane starts at byte offset
/// `lane_idx * size_of(lane_ty)` from the start of the vector.
///
/// # Panics
///
/// Panics if the place is not of a SIMD type, if the end of the requested lane would lie
/// beyond the total size of the vector, or if the place is anything other than
/// `CPlaceInner::Addr(_, None)`.
767+
pub(crate) fn place_typed_lane(
768+
self,
769+
fx: &mut FunctionCx<'_, '_, 'tcx>,
770+
lane_ty: Ty<'tcx>,
771+
lane_idx: u64,
772+
) -> CPlace<'tcx> {
773+
let layout = self.layout();
774+
assert!(layout.ty.is_simd());
775+
let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
776+
let lane_layout = fx.layout_of(lane_ty);
777+
// Bounds check in bytes: the end of the requested lane must not exceed the total size of
// the vector as computed from its original lane count and lane type.
assert!(
778+
(lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
779+
);
780+
781+
// Only `Addr(_, None)` places are handled; every other representation is treated as
// unreachable here.
match self.inner {
782+
CPlaceInner::Var(_, _) => unreachable!(),
783+
CPlaceInner::VarPair(_, _, _) => unreachable!(),
784+
CPlaceInner::Addr(ptr, None) => {
785+
// Byte offset of the requested lane from the start of the vector.
let field_offset = lane_layout.size * lane_idx;
786+
let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
787+
// The resulting place uses the layout of `lane_ty`, not of the vector's own lane type.
CPlace::for_ptr(field_ptr, lane_layout)
788+
}
789+
CPlaceInner::Addr(_, Some(_)) => unreachable!(),
790+
}
791+
}
792+
737793
pub(crate) fn place_index(
738794
self,
739795
fx: &mut FunctionCx<'_, '_, 'tcx>,

0 commit comments

Comments
 (0)
Please sign in to comment.