Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mono] Intrinsify multiply and divide for mini JIT on ARM64 #84004

Merged
merged 5 commits into from
Mar 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/mono/mono/arch/arm64/arm64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1126,6 +1126,7 @@ arm_encode_arith_imm (int imm, guint32 *shift)
#define arm_neon_umov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I64) ? 0b1 : 0b0, 0b0, (0b00001 << (type)) | ((index) << ((type) + 1)), 0b0111, (rd), (rn))
#define arm_neon_dup_e(p, width, type, rd, rn, index) arm_neon_cpy_opcode ((p), (width), 0b0, (0b00001 << (type)) | ((index) << ((type)+1)), 0b0000, (rd), (rn))
#define arm_neon_fdup_e(p, width, type, rd, rn, index) arm_neon_dup_e ((p), (width), (type) + TYPE_I32, (rd), (rn), (index))
#define arm_neon_dup_g(p, width, type, rd, rn) arm_neon_cpy_opcode ((p), (width), 0b0, (0b00001 << (type)), 0b0001, (rd), (rn))

// Specific opcodes:
#define arm_neon_dup_g_8b(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00001, 0b0001, (rd), (rn))
Expand Down
6 changes: 6 additions & 0 deletions src/mono/mono/mini/cpu-arm64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,12 @@ extract_r4: dest:f src1:x len:4
extract_r8: dest:f src1:x len:4
arm64_xaddv: dest:x src1:x len:8
xop_ovr_x_x: dest:x src1:x len:4
expand_i1: dest:x src1:i len:4
expand_i2: dest:x src1:i len:4
expand_i4: dest:x src1:i len:4
expand_i8: dest:x src1:i len:4
expand_r4: dest:x src1:f len:4
expand_r8: dest:x src1:f len:4

generic_class_init: src1:a len:44 clob:c
gc_safe_point: src1:i len:12 clob:c
Expand Down
15 changes: 14 additions & 1 deletion src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -3746,7 +3746,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
case OP_XCAST:
break;

case OP_EXPAND_I1:
case OP_EXPAND_I2:
case OP_EXPAND_I4:
case OP_EXPAND_I8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_dup_g (code, VREG_FULL, t, ins->dreg, ins->sreg1);
break;
}
case OP_EXPAND_R4:
case OP_EXPAND_R8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, 0);
break;
}
case OP_EXTRACT_I1:
case OP_EXTRACT_I2:
case OP_EXTRACT_I4:
Expand Down
3 changes: 3 additions & 0 deletions src/mono/mono/mini/simd-arm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ SIMD_OP (128, OP_XBINOP, OP_FMAX, WTDSS, _UNDEF,
SIMD_OP (128, OP_XBINOP, OP_IMIN, WTDSS, arm_neon_smin, arm_neon_smin, arm_neon_smin, _SKIP, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XBINOP, OP_IMIN_UN, WTDSS, arm_neon_umin, arm_neon_umin, arm_neon_umin, _SKIP, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XBINOP, OP_FMIN, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fmin, arm_neon_fmin)
SIMD_OP (128, OP_XBINOP, OP_IMUL, WTDSS, arm_neon_mul, arm_neon_mul, arm_neon_mul, arm_neon_mul, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XBINOP, OP_FMUL, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fmul, arm_neon_fmul)
SIMD_OP (128, OP_XBINOP, OP_FDIV, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fdiv, arm_neon_fdiv)
SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_AND, WDSS, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and)
SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_OR, WDSS, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr)
SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_XOR, WDSS, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor)
Expand Down
98 changes: 54 additions & 44 deletions src/mono/mono/mini/simd-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,28 @@ emit_simd_ins_for_sig (MonoCompile *cfg, MonoClass *klass, int opcode, int instc

static gboolean type_enum_is_unsigned (MonoTypeEnum type);
static gboolean type_enum_is_float (MonoTypeEnum type);
static int type_to_expand_op (MonoTypeEnum type);

static MonoInst*
handle_mul_div_by_scalar (MonoCompile *cfg, MonoClass *klass, MonoTypeEnum arg_type, int scalar_reg, int vector_reg, int sub_op)
{
MonoInst* ins;

if (COMPILE_LLVM (cfg)) {
ins = emit_simd_ins (cfg, klass, OP_CREATE_SCALAR_UNSAFE, scalar_reg, -1);
ins->inst_c1 = arg_type;
ins = emit_simd_ins (cfg, klass, OP_XBINOP_BYSCALAR, vector_reg, ins->dreg);
ins->inst_c0 = sub_op;
} else {
ins = emit_simd_ins (cfg, klass, type_to_expand_op (arg_type), scalar_reg, -1);
ins->inst_c1 = arg_type;
ins = emit_simd_ins (cfg, klass, OP_XBINOP, vector_reg, ins->dreg);
ins->inst_c0 = sub_op;
ins->inst_c1 = arg_type;
}

return ins;
}

static MonoInst*
emit_simd_ins_for_binary_op (MonoCompile *cfg, MonoClass *klass, MonoMethodSignature *fsig, MonoInst **args, MonoTypeEnum arg_type, int id)
Expand Down Expand Up @@ -304,13 +326,9 @@ emit_simd_ins_for_binary_op (MonoCompile *cfg, MonoClass *klass, MonoMethodSigna
case SN_op_Division: {
const char *class_name = m_class_get_name (klass);
if (strcmp ("Vector2", class_name) && strcmp ("Vector4", class_name) && strcmp ("Quaternion", class_name) && strcmp ("Plane", class_name)) {
if ((fsig->params [0]->type == MONO_TYPE_GENERICINST) && (fsig->params [1]->type != MONO_TYPE_GENERICINST)) {
MonoInst* ins = emit_simd_ins (cfg, klass, OP_CREATE_SCALAR_UNSAFE, args [1]->dreg, -1);
ins->inst_c1 = arg_type;
ins = emit_simd_ins (cfg, klass, OP_XBINOP_BYSCALAR, args [0]->dreg, ins->dreg);
ins->inst_c0 = OP_FDIV;
return ins;
} else if ((fsig->params [0]->type == MONO_TYPE_GENERICINST) && (fsig->params [1]->type == MONO_TYPE_GENERICINST)) {
if ((fsig->params [0]->type == MONO_TYPE_GENERICINST) && (fsig->params [1]->type != MONO_TYPE_GENERICINST))
return handle_mul_div_by_scalar (cfg, klass, arg_type, args [1]->dreg, args [0]->dreg, OP_FDIV);
else if ((fsig->params [0]->type == MONO_TYPE_GENERICINST) && (fsig->params [1]->type == MONO_TYPE_GENERICINST)) {
instc0 = OP_FDIV;
break;
} else {
Expand All @@ -330,19 +348,11 @@ emit_simd_ins_for_binary_op (MonoCompile *cfg, MonoClass *klass, MonoMethodSigna
case SN_op_Multiply: {
const char *class_name = m_class_get_name (klass);
if (strcmp ("Vector2", class_name) && strcmp ("Vector4", class_name) && strcmp ("Quaternion", class_name) && strcmp ("Plane", class_name)) {
if (fsig->params [1]->type != MONO_TYPE_GENERICINST) {
MonoInst* ins = emit_simd_ins (cfg, klass, OP_CREATE_SCALAR_UNSAFE, args [1]->dreg, -1);
ins->inst_c1 = arg_type;
ins = emit_simd_ins (cfg, klass, OP_XBINOP_BYSCALAR, args [0]->dreg, ins->dreg);
ins->inst_c0 = OP_FMUL;
return ins;
} else if (fsig->params [0]->type != MONO_TYPE_GENERICINST) {
MonoInst* ins = emit_simd_ins (cfg, klass, OP_CREATE_SCALAR_UNSAFE, args [0]->dreg, -1);
ins->inst_c1 = arg_type;
ins = emit_simd_ins (cfg, klass, OP_XBINOP_BYSCALAR, args [1]->dreg, ins->dreg);
ins->inst_c0 = OP_FMUL;
return ins;
} else if ((fsig->params [0]->type == MONO_TYPE_GENERICINST) && (fsig->params [1]->type == MONO_TYPE_GENERICINST)) {
if (fsig->params [1]->type != MONO_TYPE_GENERICINST)
return handle_mul_div_by_scalar (cfg, klass, arg_type, args [1]->dreg, args [0]->dreg, OP_FMUL);
else if (fsig->params [0]->type != MONO_TYPE_GENERICINST)
return handle_mul_div_by_scalar (cfg, klass, arg_type, args [0]->dreg, args [1]->dreg, OP_FMUL);
else if ((fsig->params [0]->type == MONO_TYPE_GENERICINST) && (fsig->params [1]->type == MONO_TYPE_GENERICINST)) {
instc0 = OP_FMUL;
break;
} else {
Expand Down Expand Up @@ -375,22 +385,18 @@ emit_simd_ins_for_binary_op (MonoCompile *cfg, MonoClass *klass, MonoMethodSigna
instc0 = type_enum_is_unsigned (arg_type) ? OP_IMIN_UN : OP_IMIN;
break;
case SN_Multiply:
case SN_op_Multiply:
if (fsig->params [1]->type != MONO_TYPE_GENERICINST) {
MonoInst* ins = emit_simd_ins (cfg, klass, OP_CREATE_SCALAR_UNSAFE, args [1]->dreg, -1);
ins->inst_c1 = arg_type;
ins = emit_simd_ins (cfg, klass, OP_XBINOP_BYSCALAR, args [0]->dreg, ins->dreg);
ins->inst_c0 = OP_IMUL;
return ins;
} else if (fsig->params [0]->type != MONO_TYPE_GENERICINST) {
MonoInst* ins = emit_simd_ins (cfg, klass, OP_CREATE_SCALAR_UNSAFE, args [0]->dreg, -1);
ins->inst_c1 = arg_type;
ins = emit_simd_ins (cfg, klass, OP_XBINOP_BYSCALAR, args [1]->dreg, ins->dreg);
ins->inst_c0 = OP_IMUL;
return ins;
}
case SN_op_Multiply: {
#ifdef TARGET_ARM64
if (!COMPILE_LLVM (cfg) && (arg_type == MONO_TYPE_I8 || arg_type == MONO_TYPE_U8))
return NULL;
#endif
if (fsig->params [1]->type != MONO_TYPE_GENERICINST)
return handle_mul_div_by_scalar (cfg, klass, arg_type, args [1]->dreg, args [0]->dreg, OP_IMUL);
else if (fsig->params [0]->type != MONO_TYPE_GENERICINST)
return handle_mul_div_by_scalar (cfg, klass, arg_type, args [0]->dreg, args [1]->dreg, OP_IMUL);
instc0 = OP_IMUL;
break;
}
case SN_Subtract:
case SN_op_Subtraction:
instc0 = OP_ISUB;
Expand Down Expand Up @@ -799,9 +805,9 @@ type_enum_is_float (MonoTypeEnum type)
}

static int
type_to_expand_op (MonoType *type)
type_to_expand_op (MonoTypeEnum type)
{
switch (type->type) {
switch (type) {
case MONO_TYPE_I1:
case MONO_TYPE_U1:
return OP_EXPAND_I1;
Expand Down Expand Up @@ -1262,6 +1268,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
case SN_ToScalar:
case SN_Floor:
case SN_Ceiling:
case SN_Divide:
case SN_Multiply:
break;
default:
return NULL;
Expand Down Expand Up @@ -1447,7 +1455,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype))
return NULL;
if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype))
return emit_simd_ins (cfg, klass, type_to_expand_op (etype), args [0]->dreg, -1);
return emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1);
else if (is_create_from_half_vectors_overload (fsig))
return emit_simd_ins (cfg, klass, OP_XCONCAT, args [0]->dreg, args [1]->dreg);
else if (is_elementwise_create_overload (fsig, etype))
Expand Down Expand Up @@ -1940,7 +1948,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign
#ifdef TARGET_ARM64
if (!COMPILE_LLVM (cfg)) {
return NULL;
/*if (size != 16)
if (size != 16)
return NULL;
switch (id) {
case SN_get_One:
Expand All @@ -1955,10 +1963,12 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign
case SN_op_ExclusiveOr:
case SN_op_Equality:
case SN_op_Inequality:
case SN_op_Division:
case SN_op_Multiply:
break;
default:
return NULL;
}*/
}
}
#endif

Expand Down Expand Up @@ -2166,7 +2176,7 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f
gboolean indirect = FALSE;
int dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);

int opcode = type_to_expand_op (etype);
int opcode = type_to_expand_op (etype->type);
ins = emit_simd_ins (cfg, klass, opcode, args [1]->dreg, -1);

for (int i = 1; i < fsig->param_count; ++i) {
Expand Down Expand Up @@ -2639,7 +2649,7 @@ emit_sys_numerics_vector_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSig
if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype)) {
int dreg = load_simd_vreg (cfg, cmethod, args [0], NULL);

int opcode = type_to_expand_op (etype);
int opcode = type_to_expand_op (etype->type);
ins = emit_simd_ins (cfg, klass, opcode, args [1]->dreg, -1);
ins->dreg = dreg;
return ins;
Expand Down Expand Up @@ -3408,7 +3418,7 @@ emit_arm64_intrinsics (
break;
}
}
return emit_simd_ins (cfg, ret_klass, type_to_expand_op (rtype), scalar_src_reg, -1);
return emit_simd_ins (cfg, ret_klass, type_to_expand_op (rtype->type), scalar_src_reg, -1);
}
case SN_Extract: {
int extract_op = type_to_xextract_op (arg0_type);
Expand Down Expand Up @@ -3448,7 +3458,7 @@ emit_arm64_intrinsics (
MonoType *etype = get_vector_t_elem_type (fsig->ret);
gboolean is_unsigned = type_is_unsigned (fsig->ret);
gboolean scalar = id == SN_ShiftLeftLogicalSaturateScalar;
int s2v = scalar ? OP_CREATE_SCALAR_UNSAFE : type_to_expand_op (etype);
int s2v = scalar ? OP_CREATE_SCALAR_UNSAFE : type_to_expand_op (etype->type);
int xop = scalar ? OP_XOP_OVR_SCALAR_X_X_X : OP_XOP_OVR_X_X_X;
int iid = is_unsigned ? INTRINS_AARCH64_ADV_SIMD_UQSHL : INTRINS_AARCH64_ADV_SIMD_SQSHL;
MonoInst *shift_vector = emit_simd_ins (cfg, ret_klass, s2v, args [1]->dreg, -1);
Expand Down Expand Up @@ -4851,7 +4861,7 @@ emit_wasm_supported_intrinsics (
case SN_Splat: {
MonoType *etype = get_vector_t_elem_type (fsig->ret);
g_assert (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype));
return emit_simd_ins (cfg, klass, type_to_expand_op (etype), args [0]->dreg, -1);
return emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1);
}
case SN_Dot:
return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, INTRINS_WASM_DOT, -1, fsig, args);
Expand Down