Skip to content

Commit 61e38ce

Browse files
committed
Implement all SSE intrinsics used by the jpeg-decoder crate
1 parent 4381949 commit 61e38ce

File tree

1 file changed

+71
-0
lines changed

1 file changed

+71
-0
lines changed

src/intrinsics/llvm_x86.rs

+71
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,77 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
413413
ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
414414
}
415415
}
416+
417+
"llvm.x86.ssse3.pmul.hr.sw.128" => {
418+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16&ig_expand=4782
419+
intrinsic_args!(fx, args => (a, b); intrinsic);
420+
421+
assert_eq!(a.layout(), b.layout());
422+
let layout = a.layout();
423+
424+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
425+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
426+
assert_eq!(lane_ty, fx.tcx.types.i16);
427+
assert_eq!(ret_lane_ty, fx.tcx.types.i16);
428+
assert_eq!(lane_count, ret_lane_count);
429+
430+
let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
431+
for out_lane_idx in 0..lane_count {
432+
let a_lane = a.value_lane(fx, out_lane_idx).load_scalar(fx);
433+
let a_lane = fx.bcx.ins().sextend(types::I32, a_lane);
434+
let b_lane = b.value_lane(fx, out_lane_idx).load_scalar(fx);
435+
let b_lane = fx.bcx.ins().sextend(types::I32, b_lane);
436+
437+
let mul: Value = fx.bcx.ins().imul(a_lane, b_lane);
438+
let shifted = fx.bcx.ins().ushr_imm(mul, 14);
439+
let incremented = fx.bcx.ins().iadd_imm(shifted, 1);
440+
let shifted_again = fx.bcx.ins().ushr_imm(incremented, 1);
441+
442+
let res_lane = fx.bcx.ins().ireduce(types::I16, shifted_again);
443+
let res_lane = CValue::by_val(res_lane, ret_lane_layout);
444+
445+
ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
446+
}
447+
}
448+
449+
"llvm.x86.sse2.packuswb.128" => {
450+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903
451+
intrinsic_args!(fx, args => (a, b); intrinsic);
452+
453+
assert_eq!(a.layout(), b.layout());
454+
let layout = a.layout();
455+
456+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
457+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
458+
assert_eq!(lane_ty, fx.tcx.types.i16);
459+
assert_eq!(ret_lane_ty, fx.tcx.types.u8);
460+
assert_eq!(lane_count * 2, ret_lane_count);
461+
462+
let zero = fx.bcx.ins().iconst(types::I16, 0);
463+
let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
464+
let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
465+
466+
for idx in 0..lane_count {
467+
let lane = a.value_lane(fx, idx).load_scalar(fx);
468+
let sat = fx.bcx.ins().smax(lane, zero);
469+
let sat = fx.bcx.ins().umin(sat, max_u8);
470+
let res = fx.bcx.ins().ireduce(types::I8, sat);
471+
472+
let res_lane = CValue::by_val(res, ret_lane_layout);
473+
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
474+
}
475+
476+
for idx in 0..lane_count {
477+
let lane = b.value_lane(fx, idx).load_scalar(fx);
478+
let sat = fx.bcx.ins().smax(lane, zero);
479+
let sat = fx.bcx.ins().umin(sat, max_u8);
480+
let res = fx.bcx.ins().ireduce(types::I8, sat);
481+
482+
let res_lane = CValue::by_val(res, ret_lane_layout);
483+
ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
484+
}
485+
}
486+
416487
_ => {
417488
fx.tcx
418489
.sess

0 commit comments

Comments
 (0)