Skip to content

Commit

Permalink
JIT: Improve x86 unsigned to floating cast codegen (#111595)
Browse files Browse the repository at this point in the history
* improve x86 integral to floating cast codegen

* more cleanup

* more cleanup
  • Loading branch information
saucecontrol authored Jan 31, 2025
1 parent d9ab716 commit fa0f65c
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 204 deletions.
4 changes: 0 additions & 4 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1623,11 +1623,7 @@ class CodeGen final : public CodeGenInterface

instruction ins_Copy(var_types dstType);
instruction ins_Copy(regNumber srcReg, var_types dstType);
#if defined(TARGET_XARCH)
instruction ins_FloatConv(var_types to, var_types from, emitAttr attr);
#elif defined(TARGET_ARM)
instruction ins_FloatConv(var_types to, var_types from);
#endif
instruction ins_MathOp(genTreeOps oper, var_types type);

void instGen_Return(unsigned stkArgSize);
Expand Down
94 changes: 36 additions & 58 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7242,10 +7242,9 @@ void CodeGen::genFloatToFloatCast(GenTree* treeNode)
}
else
{
instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(dstType));
instruction ins = ins_FloatConv(dstType, srcType);

// integral to floating-point conversions all have RMW semantics if VEX support
// is not available
// floating-point conversions all have RMW semantics if VEX support is not available

bool isRMW = !compiler->canUseVexEncoding();
inst_RV_RV_TT(ins, emitTypeSize(dstType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE);
Expand All @@ -7271,7 +7270,7 @@ void CodeGen::genFloatToFloatCast(GenTree* treeNode)
void CodeGen::genIntToFloatCast(GenTree* treeNode)
{
// int type --> float/double conversions are always non-overflow ones
assert(treeNode->OperGet() == GT_CAST);
assert(treeNode->OperIs(GT_CAST));
assert(!treeNode->gtOverflow());

regNumber targetReg = treeNode->GetRegNum();
Expand All @@ -7289,11 +7288,6 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
var_types srcType = op1->TypeGet();
assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType));

#if !defined(TARGET_64BIT)
// We expect morph to replace long to float/double casts with helper calls
noway_assert(!varTypeIsLong(srcType));
#endif // !defined(TARGET_64BIT)

// Since xarch emitter doesn't handle reporting gc-info correctly while casting away gc-ness we
// ensure srcType of a cast is non gc-type. Codegen should never see BYREF as source type except
// for GT_LCL_ADDR that represent stack addresses and can be considered as TYP_I_IMPL. In all other
Expand All @@ -7302,71 +7296,47 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
// operation.
if (srcType == TYP_BYREF)
{
noway_assert(op1->OperGet() == GT_LCL_ADDR);
noway_assert(op1->OperIs(GT_LCL_ADDR));
srcType = TYP_I_IMPL;
}

// force the srcType to unsigned if GT_UNSIGNED flag is set
if (treeNode->gtFlags & GTF_UNSIGNED)
{
srcType = varTypeToUnsigned(srcType);
}

noway_assert(!varTypeIsGC(srcType));

// We should never be seeing srcType whose size is not sizeof(int) nor sizeof(long).
// At this point, we should not see a srcType that is not int or long.
// For conversions from byte/sbyte/int16/uint16 to float/double, we would expect
// either the front-end or lowering phase to have generated two levels of cast.
// The first one is for widening smaller int type to int32 and the second one is
// to the float/double.
emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG))));

// Also we don't expect to see uint32 -> float/double and uint64 -> float conversions
// here since they should have been lowered appropriately.
noway_assert(srcType != TYP_UINT);
assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT) || compiler->canUseEvexEncodingDebugOnly());

if ((srcType == TYP_ULONG) && varTypeIsFloating(dstType) && compiler->canUseEvexEncoding())
{
assert(compiler->canUseEvexEncodingDebugOnly());
genConsumeOperands(treeNode->AsOp());
instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType));
GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1);
genProduceReg(treeNode);
return;
}
// On 32-bit, we expect morph to replace long to float/double casts with helper calls,
// so we should only see int here.
noway_assert(varTypeIsIntOrI(srcType));

// To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used
// To convert integral type to floating, the cvt[u]si2ss/sd instruction is used
// which does a partial write to lower 4/8 bytes of xmm register keeping the other
// upper bytes unmodified. If "cvtsi2ss/sd xmmReg, r32/r64" occurs inside a loop,
// upper bytes unmodified. If "cvt[u]si2ss/sd xmmReg, r32/r64" occurs inside a loop,
// the partial write could introduce a false dependency and could cause a stall
// if there are further uses of xmmReg. We have such a case occurring with a
// customer reported version of SpectralNorm benchmark, resulting in 2x perf
// regression. To avoid false dependency, we emit "xorps xmmReg, xmmReg" before
// cvtsi2ss/sd instruction.
// cvt[u]si2ss/sd instruction.

genConsumeOperands(treeNode->AsOp());
GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, treeNode->GetRegNum(), treeNode->GetRegNum(),
treeNode->GetRegNum(), INS_OPTS_NONE);

// Note that here we need to specify srcType that will determine
// the size of source reg/mem operand and rex.w prefix.
instruction ins = ins_FloatConv(dstType, TYP_INT, emitTypeSize(srcType));

// integral to floating-point conversions all have RMW semantics if VEX support
// is not available
GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg, INS_OPTS_NONE);

const bool isRMW = !compiler->canUseVexEncoding();
// force the srcType to unsigned if GT_UNSIGNED flag is set
if (treeNode->IsUnsigned())
{
srcType = varTypeToUnsigned(srcType);
}

// Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction
// will interpret ULONG value as LONG. Hence we need to adjust the
// result if sign-bit of srcType is set.
if (srcType == TYP_ULONG)
if (srcType == TYP_ULONG && !compiler->canUseEvexEncoding())
{
assert(dstType == TYP_DOUBLE);
assert(op1->isUsedFromReg());

// If we don't have the EVEX unsigned conversion instructions, use the signed
// long -> float/double conversion instruction instead and fix up the result.

instruction convIns = ins_FloatConv(dstType, TYP_LONG);
instruction addIns = (dstType == TYP_FLOAT) ? INS_addss : INS_addsd;

// The following LONG->DOUBLE cast machinery is based on clang's implementation
// with "-ffp-model=strict" flag:
//
Expand All @@ -7393,15 +7363,23 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
GetEmitter()->emitIns_R_R(INS_or, EA_8BYTE, tmpReg2, tmpReg1);
GetEmitter()->emitIns_R_R(INS_test, EA_8BYTE, argReg, argReg);
GetEmitter()->emitIns_R_R(INS_cmovns, EA_8BYTE, tmpReg2, argReg);
GetEmitter()->emitIns_R_R(ins, EA_8BYTE, targetReg, tmpReg2);
GetEmitter()->emitIns_R_R(convIns, EA_8BYTE, targetReg, tmpReg2);

BasicBlock* label = genCreateTempLabel();
inst_JMP(EJ_jns, label);
GetEmitter()->emitIns_R_R(INS_addsd, EA_8BYTE, targetReg, targetReg);
GetEmitter()->emitIns_R_R(addIns, EA_ATTR(genTypeSize(dstType)), targetReg, targetReg);
genDefineTempLabel(label);
}
else
{
assert(varTypeIsIntOrI(srcType) || compiler->canUseEvexEncodingDebugOnly());

instruction ins = ins_FloatConv(dstType, srcType);

// integral to floating-point conversions all have RMW semantics if VEX support
// is not available

const bool isRMW = !compiler->canUseVexEncoding();
inst_RV_RV_TT(ins, emitTypeSize(srcType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE);
}
genProduceReg(treeNode);
Expand All @@ -7427,7 +7405,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
{
// we don't expect to see overflow detecting float/double --> int type conversions here
// as they should have been converted into helper calls by front-end.
assert(treeNode->OperGet() == GT_CAST);
assert(treeNode->OperIs(GT_CAST));
assert(!treeNode->gtOverflow());

regNumber targetReg = treeNode->GetRegNum();
Expand Down Expand Up @@ -7470,7 +7448,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
// Note that we need to specify dstType here so that it will determine
// the size of destination integer register and also the rex.w prefix.
genConsumeOperands(treeNode->AsOp());
instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType));
instruction ins = ins_FloatConv(dstType, srcType);
GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1);
genProduceReg(treeNode);
}
Expand Down
75 changes: 36 additions & 39 deletions src/coreclr/jit/instr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2414,49 +2414,57 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type)
// Arguments:
// to - Destination type.
// from - Source type.
// attr - Input size.
//
// Returns:
// The correct conversion instruction to use based on src and dst types.
//
instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
instruction CodeGen::ins_FloatConv(var_types to, var_types from)
{
// AVX: Supports following conversions
// srcType = int16/int64 castToType = float
// AVX512: Supports following conversions
// srcType = ulong castToType = double/float

switch (from)
{
// int/long -> float/double use the same instruction but type size would be different.
case TYP_INT:
switch (to)
{
case TYP_FLOAT:
return INS_cvtsi2ss32;
case TYP_DOUBLE:
return INS_cvtsi2sd32;
default:
unreached();
}
break;

case TYP_LONG:
switch (to)
{
case TYP_FLOAT:
{
if (EA_SIZE(attr) == EA_4BYTE)
{
return INS_cvtsi2ss32;
}
else if (EA_SIZE(attr) == EA_8BYTE)
{
return INS_cvtsi2ss64;
}
return INS_cvtsi2ss64;
case TYP_DOUBLE:
return INS_cvtsi2sd64;
default:
unreached();
}
}
break;

case TYP_UINT:
switch (to)
{
case TYP_FLOAT:
return INS_vcvtusi2ss32;
case TYP_DOUBLE:
{
if (EA_SIZE(attr) == EA_4BYTE)
{
return INS_cvtsi2sd32;
}
else if (EA_SIZE(attr) == EA_8BYTE)
{
return INS_cvtsi2sd64;
}
return INS_vcvtusi2sd32;
default:
unreached();
}
}
break;

case TYP_ULONG:
switch (to)
{
case TYP_FLOAT:
return INS_vcvtusi2ss64;
case TYP_DOUBLE:
return INS_vcvtusi2sd64;
default:
unreached();
}
Expand Down Expand Up @@ -2502,17 +2510,6 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
}
break;

case TYP_ULONG:
switch (to)
{
case TYP_DOUBLE:
return INS_vcvtusi2sd64;
case TYP_FLOAT:
return INS_vcvtusi2ss64;
default:
unreached();
}

default:
unreached();
}
Expand Down
35 changes: 9 additions & 26 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -800,10 +800,9 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
* GT_CAST(int16, float/double) = GT_CAST(GT_CAST(int16, int32), float/double)
* GT_CAST(uint16, float/double) = GT_CAST(GT_CAST(uint16, int32), float/double)
*
* SSE2 conversion instructions operate on signed integers. casts from Uint32/Uint64
* Unless the EVEX conversion instructions are available, casts from Uint32
* are morphed as follows by front-end and hence should not be seen here.
* GT_CAST(uint32, float/double) = GT_CAST(GT_CAST(uint32, long), float/double)
* GT_CAST(uint64, float) = GT_CAST(GT_CAST(uint64, double), float)
*
*
* Similarly casts from float/double to a smaller int type are transformed as follows:
Expand All @@ -812,24 +811,14 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
* GT_CAST(float/double, int16) = GT_CAST(GT_CAST(double/double, int32), int16)
* GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(double/double, int32), uint16)
*
 * SSE2 has instructions to convert a float/double value into a signed 32/64-bit
* integer. The above transformations help us to leverage those instructions.
*
* Note that for the following conversions we still depend on helper calls and
* don't expect to see them here.
* i) GT_CAST(float/double, uint64)
* i) GT_CAST(float/double, uint64) when EVEX is not available
* ii) GT_CAST(float/double, int type with overflow detection)
*
* TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above.
* There are hardly any occurrences of this conversion operation in platform
* assemblies or in CQ perf benchmarks (1 occurrence in corelib, microsoft.jscript,
* 1 occurrence in Roslyn and no occurrences in system, system.core, system.numerics
* system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that
* doing this optimization is a win, should consider generating in-lined code.
*/
GenTree* Lowering::LowerCast(GenTree* tree)
{
assert(tree->OperGet() == GT_CAST);
assert(tree->OperIs(GT_CAST));

GenTree* castOp = tree->AsCast()->CastOp();
var_types castToType = tree->CastToType();
Expand All @@ -838,33 +827,27 @@ GenTree* Lowering::LowerCast(GenTree* tree)
var_types tmpType = TYP_UNDEF;

// force the srcType to unsigned if GT_UNSIGNED flag is set
if (tree->gtFlags & GTF_UNSIGNED)
if (tree->IsUnsigned())
{
srcType = varTypeToUnsigned(srcType);
}

// We should never see the following casts as they are expected to be lowered
// appropriately or converted into helper calls by front-end.
// We should not see the following casts unless directly supported by hardware,
// as they are expected to be lowered appropriately or converted into helper calls by front-end.
// srcType = float/double castToType = * and overflow detecting cast
// Reason: must be converted to a helper call
// srcType = float/double, castToType = ulong
// Reason: must be converted to a helper call
// srcType = uint castToType = float/double
// Reason: uint -> float/double = uint -> long -> float/double
// srcType = ulong castToType = float
// Reason: ulong -> float = ulong -> double -> float
if (varTypeIsFloating(srcType))
{
noway_assert(!tree->gtOverflow());
assert(castToType != TYP_ULONG || comp->canUseEvexEncoding());
assert(castToType != TYP_ULONG || comp->canUseEvexEncodingDebugOnly());
}
else if (srcType == TYP_UINT)
{
noway_assert(!varTypeIsFloating(castToType));
}
else if (srcType == TYP_ULONG)
{
assert(castToType != TYP_FLOAT || comp->canUseEvexEncoding());
assert(castToType != TYP_FLOAT || comp->canUseEvexEncodingDebugOnly());
}

#if defined(TARGET_AMD64)
Expand Down Expand Up @@ -984,7 +967,7 @@ GenTree* Lowering::LowerCast(GenTree* tree)
BlockRange().InsertAfter(newCast, newTree);
LowerNode(newTree);

// usage 2 --> use the compared mask with input value and max value to blend
// usage 2 --> use the compared mask with input value and max value to blend
GenTree* control = comp->gtNewIconNode(0xCA); // (B & A) | (C & ~A)
BlockRange().InsertAfter(newTree, control);
GenTree* cndSelect = comp->gtNewSimdTernaryLogicNode(TYP_SIMD16, compMask, maxValDstType, newTree,
Expand Down
Loading

0 comments on commit fa0f65c

Please sign in to comment.