JIT: Improve x86 unsigned to floating cast codegen #111595

Merged Jan 31, 2025 · 5 commits · Changes from all commits
4 changes: 0 additions & 4 deletions src/coreclr/jit/codegen.h
@@ -1622,11 +1622,7 @@ class CodeGen final : public CodeGenInterface

     instruction ins_Copy(var_types dstType);
     instruction ins_Copy(regNumber srcReg, var_types dstType);
-#if defined(TARGET_XARCH)
-    instruction ins_FloatConv(var_types to, var_types from, emitAttr attr);
-#elif defined(TARGET_ARM)
     instruction ins_FloatConv(var_types to, var_types from);
-#endif
     instruction ins_MathOp(genTreeOps oper, var_types type);
 
     void instGen_Return(unsigned stkArgSize);
94 changes: 36 additions & 58 deletions src/coreclr/jit/codegenxarch.cpp

@@ -7218,10 +7218,9 @@ void CodeGen::genFloatToFloatCast(GenTree* treeNode)
     }
     else
     {
-        instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(dstType));
+        instruction ins = ins_FloatConv(dstType, srcType);
 
-        // integral to floating-point conversions all have RMW semantics if VEX support
-        // is not available
+        // floating-point conversions all have RMW semantics if VEX support is not available
 
         bool isRMW = !compiler->canUseVexEncoding();
         inst_RV_RV_TT(ins, emitTypeSize(dstType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE);
@@ -7247,7 +7246,7 @@ void CodeGen::genFloatToFloatCast(GenTree* treeNode)
 void CodeGen::genIntToFloatCast(GenTree* treeNode)
 {
     // int type --> float/double conversions are always non-overflow ones
-    assert(treeNode->OperGet() == GT_CAST);
+    assert(treeNode->OperIs(GT_CAST));
     assert(!treeNode->gtOverflow());
 
     regNumber targetReg = treeNode->GetRegNum();
@@ -7265,11 +7264,6 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     var_types srcType = op1->TypeGet();
     assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType));
 
-#if !defined(TARGET_64BIT)
-    // We expect morph to replace long to float/double casts with helper calls
-    noway_assert(!varTypeIsLong(srcType));
-#endif // !defined(TARGET_64BIT)
-
     // Since xarch emitter doesn't handle reporting gc-info correctly while casting away gc-ness we
     // ensure srcType of a cast is non gc-type. Codegen should never see BYREF as source type except
     // for GT_LCL_ADDR that represent stack addresses and can be considered as TYP_I_IMPL. In all other
@@ -7278,71 +7272,47 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     // operation.
     if (srcType == TYP_BYREF)
     {
-        noway_assert(op1->OperGet() == GT_LCL_ADDR);
+        noway_assert(op1->OperIs(GT_LCL_ADDR));
         srcType = TYP_I_IMPL;
     }
 
-    // force the srcType to unsigned if GT_UNSIGNED flag is set
-    if (treeNode->gtFlags & GTF_UNSIGNED)
-    {
-        srcType = varTypeToUnsigned(srcType);
-    }
-
     noway_assert(!varTypeIsGC(srcType));
 
-    // We should never be seeing srcType whose size is not sizeof(int) nor sizeof(long).
+    // At this point, we should not see a srcType that is not int or long.
     // For conversions from byte/sbyte/int16/uint16 to float/double, we would expect
     // either the front-end or lowering phase to have generated two levels of cast.
     // The first one is for widening smaller int type to int32 and the second one is
     // to the float/double.
-    emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
-    noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG))));
-
-    // Also we don't expect to see uint32 -> float/double and uint64 -> float conversions
-    // here since they should have been lowered appropriately.
-    noway_assert(srcType != TYP_UINT);
-    assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT) || compiler->canUseEvexEncodingDebugOnly());
-
-    if ((srcType == TYP_ULONG) && varTypeIsFloating(dstType) && compiler->canUseEvexEncoding())
-    {
-        assert(compiler->canUseEvexEncodingDebugOnly());
-        genConsumeOperands(treeNode->AsOp());
-        instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType));
-        GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1);
-        genProduceReg(treeNode);
-        return;
-    }
+    // On 32-bit, we expect morph to replace long to float/double casts with helper calls,
+    // so we should only see int here.
+    noway_assert(varTypeIsIntOrI(srcType));
 
-    // To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used
+    // To convert integral type to floating, the cvt[u]si2ss/sd instruction is used
     // which does a partial write to lower 4/8 bytes of xmm register keeping the other
-    // upper bytes unmodified. If "cvtsi2ss/sd xmmReg, r32/r64" occurs inside a loop,
+    // upper bytes unmodified. If "cvt[u]si2ss/sd xmmReg, r32/r64" occurs inside a loop,
     // the partial write could introduce a false dependency and could cause a stall
     // if there are further uses of xmmReg. We have such a case occurring with a
     // customer reported version of SpectralNorm benchmark, resulting in 2x perf
     // regression. To avoid false dependency, we emit "xorps xmmReg, xmmReg" before
-    // cvtsi2ss/sd instruction.
+    // cvt[u]si2ss/sd instruction.
 
     genConsumeOperands(treeNode->AsOp());
-    GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, treeNode->GetRegNum(), treeNode->GetRegNum(),
-                                     treeNode->GetRegNum(), INS_OPTS_NONE);
-
-    // Note that here we need to specify srcType that will determine
-    // the size of source reg/mem operand and rex.w prefix.
-    instruction ins = ins_FloatConv(dstType, TYP_INT, emitTypeSize(srcType));
-
-    // integral to floating-point conversions all have RMW semantics if VEX support
-    // is not available
-
-    const bool isRMW = !compiler->canUseVexEncoding();
+    GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg, INS_OPTS_NONE);
 
-    // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction
-    // will interpret ULONG value as LONG. Hence we need to adjust the
-    // result if sign-bit of srcType is set.
-    if (srcType == TYP_ULONG)
+    // force the srcType to unsigned if GT_UNSIGNED flag is set
+    if (treeNode->IsUnsigned())
+    {
+        srcType = varTypeToUnsigned(srcType);
+    }
+
+    if (srcType == TYP_ULONG && !compiler->canUseEvexEncoding())
     {
-        assert(dstType == TYP_DOUBLE);
         assert(op1->isUsedFromReg());
 
+        // If we don't have the EVEX unsigned conversion instructions, use the signed
+        // long -> float/double conversion instruction instead and fix up the result.
+
+        instruction convIns = ins_FloatConv(dstType, TYP_LONG);
+        instruction addIns  = (dstType == TYP_FLOAT) ? INS_addss : INS_addsd;
+
         // The following LONG->DOUBLE cast machinery is based on clang's implementation
         // with "-ffp-model=strict" flag:
        //
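For reference, the false-dependency comment above maps onto a pattern that can be written with intrinsics. This is only an illustrative sketch (the JIT emits raw instructions, not intrinsic calls); the function name is invented for the example:

```cpp
// Sketch: why the JIT zeroes the XMM register first. cvtsi2ss writes only the
// low 4 bytes of its destination, so without the xorps the result register
// keeps a dependency on whatever instruction wrote it last - a stall risk in loops.
#include <immintrin.h>

float IntToFloat(int x)
{
    __m128 dst = _mm_setzero_ps();  // xorps xmm, xmm: breaks the false dependency
    dst = _mm_cvtsi32_ss(dst, x);   // cvtsi2ss xmm, r32: partial write of the low 4 bytes
    return _mm_cvtss_f32(dst);
}
```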
@@ -7369,15 +7339,23 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
         GetEmitter()->emitIns_R_R(INS_or, EA_8BYTE, tmpReg2, tmpReg1);
         GetEmitter()->emitIns_R_R(INS_test, EA_8BYTE, argReg, argReg);
         GetEmitter()->emitIns_R_R(INS_cmovns, EA_8BYTE, tmpReg2, argReg);
-        GetEmitter()->emitIns_R_R(ins, EA_8BYTE, targetReg, tmpReg2);
+        GetEmitter()->emitIns_R_R(convIns, EA_8BYTE, targetReg, tmpReg2);
 
         BasicBlock* label = genCreateTempLabel();
         inst_JMP(EJ_jns, label);
-        GetEmitter()->emitIns_R_R(INS_addsd, EA_8BYTE, targetReg, targetReg);
+        GetEmitter()->emitIns_R_R(addIns, EA_ATTR(genTypeSize(dstType)), targetReg, targetReg);
         genDefineTempLabel(label);
     }
     else
     {
+        assert(varTypeIsIntOrI(srcType) || compiler->canUseEvexEncodingDebugOnly());
+
+        instruction ins = ins_FloatConv(dstType, srcType);
+
+        // integral to floating-point conversions all have RMW semantics if VEX support
+        // is not available
+
+        const bool isRMW = !compiler->canUseVexEncoding();
         inst_RV_RV_TT(ins, emitTypeSize(srcType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE);
     }
     genProduceReg(treeNode);
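The non-EVEX TYP_ULONG path above mirrors clang's strict-fp sequence. A standalone C++ sketch of the same fix-up, under the assumption that only signed conversion instructions are available (the helper name is hypothetical; the JIT emits this as the shr/and/or/test/cmovns/cvtsi2sd/addsd sequence shown in the hunk):

```cpp
#include <cstdint>

// Unsigned 64-bit to double when only signed conversions exist.
double UInt64ToDouble(uint64_t v)
{
    int64_t s = (int64_t)v;
    if (s >= 0)
    {
        return (double)s;             // sign bit clear: plain cvtsi2sd is correct
    }
    // Sign bit set: halve with the low bit folded back in ("round to odd") so
    // the final doubling rounds the same way a direct conversion would, then
    // convert the now-positive value and double it (addsd targetReg, targetReg).
    uint64_t half = (v >> 1) | (v & 1);
    double d = (double)(int64_t)half;
    return d + d;
}
```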
@@ -7403,7 +7381,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
 {
     // we don't expect to see overflow detecting float/double --> int type conversions here
     // as they should have been converted into helper calls by front-end.
-    assert(treeNode->OperGet() == GT_CAST);
+    assert(treeNode->OperIs(GT_CAST));
     assert(!treeNode->gtOverflow());
 
     regNumber targetReg = treeNode->GetRegNum();
@@ -7446,7 +7424,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
     // Note that we need to specify dstType here so that it will determine
     // the size of destination integer register and also the rex.w prefix.
     genConsumeOperands(treeNode->AsOp());
-    instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType));
+    instruction ins = ins_FloatConv(dstType, srcType);
     GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1);
     genProduceReg(treeNode);
 }
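For the reverse direction, dstType selects between the 32- and 64-bit forms of the truncating conversion. Roughly, in intrinsics terms (a sketch with invented function names, not the JIT's code; the 64-bit form is x64-only):

```cpp
#include <immintrin.h>

// cvttss2si r32, xmm vs. cvttss2si r64, xmm -- same mnemonic, but the REX.W
// prefix selects the 64-bit destination register, which is why the size must
// come from dstType.
int       FloatToInt32(float f) { return _mm_cvttss_si32(_mm_set_ss(f)); }
long long FloatToInt64(float f) { return _mm_cvttss_si64(_mm_set_ss(f)); }
```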
75 changes: 36 additions & 39 deletions src/coreclr/jit/instr.cpp

@@ -2414,49 +2414,57 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type)
 // Arguments:
 //    to   - Destination type.
 //    from - Source type.
-//    attr - Input size.
 //
 // Returns:
 //    The correct conversion instruction to use based on src and dst types.
 //
-instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
+instruction CodeGen::ins_FloatConv(var_types to, var_types from)
 {
-    // AVX: Supports following conversions
-    //   srcType = int16/int64    castToType = float
-    // AVX512: Supports following conversions
-    //   srcType = ulong          castToType = double/float
-
     switch (from)
     {
-        // int/long -> float/double use the same instruction but type size would be different.
         case TYP_INT:
+            switch (to)
+            {
+                case TYP_FLOAT:
+                    return INS_cvtsi2ss32;
+                case TYP_DOUBLE:
+                    return INS_cvtsi2sd32;
+                default:
+                    unreached();
+            }
+            break;
+
         case TYP_LONG:
             switch (to)
             {
                 case TYP_FLOAT:
-                {
-                    if (EA_SIZE(attr) == EA_4BYTE)
-                    {
-                        return INS_cvtsi2ss32;
-                    }
-                    else if (EA_SIZE(attr) == EA_8BYTE)
-                    {
-                        return INS_cvtsi2ss64;
-                    }
+                    return INS_cvtsi2ss64;
+                case TYP_DOUBLE:
+                    return INS_cvtsi2sd64;
+                default:
                     unreached();
-                }
+            }
+            break;
+
+        case TYP_UINT:
+            switch (to)
+            {
+                case TYP_FLOAT:
+                    return INS_vcvtusi2ss32;
                 case TYP_DOUBLE:
-                {
-                    if (EA_SIZE(attr) == EA_4BYTE)
-                    {
-                        return INS_cvtsi2sd32;
-                    }
-                    else if (EA_SIZE(attr) == EA_8BYTE)
-                    {
-                        return INS_cvtsi2sd64;
-                    }
+                    return INS_vcvtusi2sd32;
+                default:
                     unreached();
-                }
+            }
+            break;
+
+        case TYP_ULONG:
+            switch (to)
+            {
+                case TYP_FLOAT:
+                    return INS_vcvtusi2ss64;
+                case TYP_DOUBLE:
+                    return INS_vcvtusi2sd64;
                 default:
                     unreached();
             }
@@ -2502,17 +2510,6 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
             }
             break;
 
-        case TYP_ULONG:
-            switch (to)
-            {
-                case TYP_DOUBLE:
-                    return INS_vcvtusi2sd64;
-                case TYP_FLOAT:
-                    return INS_vcvtusi2ss64;
-                default:
-                    unreached();
-            }
-
         default:
             unreached();
     }
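With the emitAttr parameter gone, the source/destination pair alone picks the instruction. A standalone mirror of the resulting mapping, with illustrative names (the real function returns the JIT's instruction enum, not strings):

```cpp
#include <cstdio>

enum class SrcTy { Int, Long, UInt, ULong };

// Mirrors the new ins_FloatConv selection: signed sources use cvtsi2ss/sd,
// unsigned sources use the EVEX vcvtusi2ss/sd forms, and the 32/64 suffix now
// comes from the source type rather than a separate size argument.
const char* FloatConvName(SrcTy from, bool toDouble)
{
    switch (from)
    {
        case SrcTy::Int:   return toDouble ? "cvtsi2sd32"   : "cvtsi2ss32";
        case SrcTy::Long:  return toDouble ? "cvtsi2sd64"   : "cvtsi2ss64";
        case SrcTy::UInt:  return toDouble ? "vcvtusi2sd32" : "vcvtusi2ss32";
        case SrcTy::ULong: return toDouble ? "vcvtusi2sd64" : "vcvtusi2ss64";
    }
    return nullptr;
}

int main()
{
    std::printf("%s\n", FloatConvName(SrcTy::ULong, false)); // vcvtusi2ss64
}
```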
35 changes: 9 additions & 26 deletions src/coreclr/jit/lowerxarch.cpp

@@ -800,10 +800,9 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
  * GT_CAST(int16, float/double)  = GT_CAST(GT_CAST(int16, int32), float/double)
  * GT_CAST(uint16, float/double) = GT_CAST(GT_CAST(uint16, int32), float/double)
  *
- * SSE2 conversion instructions operate on signed integers. casts from Uint32/Uint64
+ * Unless the EVEX conversion instructions are available, casts from Uint32
  * are morphed as follows by front-end and hence should not be seen here.
  * GT_CAST(uint32, float/double) = GT_CAST(GT_CAST(uint32, long), float/double)
- * GT_CAST(uint64, float)        = GT_CAST(GT_CAST(uint64, double), float)
  *
  *
  * Similarly casts from float/double to a smaller int type are transformed as follows:
@@ -812,24 +811,14 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
  * GT_CAST(float/double, int16)  = GT_CAST(GT_CAST(double/double, int32), int16)
  * GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(double/double, int32), uint16)
  *
- * SSE2 has instructions to convert a float/double vlaue into a signed 32/64-bit
- * integer. The above transformations help us to leverage those instructions.
- *
  * Note that for the following conversions we still depend on helper calls and
  * don't expect to see them here.
- *  i) GT_CAST(float/double, uint64)
+ *  i) GT_CAST(float/double, uint64) when EVEX is not available
  * ii) GT_CAST(float/double, int type with overflow detection)
- *
- * TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above.
- * There are hardly any occurrences of this conversion operation in platform
- * assemblies or in CQ perf benchmarks (1 occurrence in corelib, microsoft.jscript,
- * 1 occurrence in Roslyn and no occurrences in system, system.core, system.numerics
- * system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that
- * doing this optimization is a win, should consider generating in-lined code.
 */
 GenTree* Lowering::LowerCast(GenTree* tree)
 {
-    assert(tree->OperGet() == GT_CAST);
+    assert(tree->OperIs(GT_CAST));
 
     GenTree*  castOp     = tree->AsCast()->CastOp();
     var_types castToType = tree->CastToType();
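In plain C++ terms, the two-level rewrites this comment block describes amount to the following (a sketch of the semantics only, with invented names, not JIT code):

```cpp
#include <cstdint>

// Small-int sources are first widened to int32, then converted.
float Int16ToFloat(int16_t x) { return (float)(int32_t)x; }

// Without EVEX, uint32 is widened to a signed 64-bit value first; the widening
// is exact, so rounding happens only once, in the long -> float conversion.
float UInt32ToFloat(uint32_t x) { return (float)(int64_t)x; }
```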
@@ -838,33 +827,27 @@ GenTree* Lowering::LowerCast(GenTree* tree)
     var_types tmpType    = TYP_UNDEF;
 
     // force the srcType to unsigned if GT_UNSIGNED flag is set
-    if (tree->gtFlags & GTF_UNSIGNED)
+    if (tree->IsUnsigned())
     {
         srcType = varTypeToUnsigned(srcType);
     }
 
-    // We should never see the following casts as they are expected to be lowered
-    // appropriately or converted into helper calls by front-end.
+    // We should not see the following casts unless directly supported by hardware,
+    // as they are expected to be lowered appropriately or converted into helper calls by front-end.
     //   srcType = float/double                    castToType = * and overflow detecting cast
     //   Reason: must be converted to a helper call
-    //   srcType = float/double,                   castToType = ulong
-    //   Reason: must be converted to a helper call
     //   srcType = uint                            castToType = float/double
     //   Reason: uint -> float/double = uint -> long -> float/double
     //   srcType = ulong                           castToType = float
     //   Reason: ulong -> float = ulong -> double -> float
     if (varTypeIsFloating(srcType))
     {
         noway_assert(!tree->gtOverflow());
-        assert(castToType != TYP_ULONG || comp->canUseEvexEncoding());
+        assert(castToType != TYP_ULONG || comp->canUseEvexEncodingDebugOnly());
     }
-    else if (srcType == TYP_UINT)
-    {
-        noway_assert(!varTypeIsFloating(castToType));
-    }
     else if (srcType == TYP_ULONG)
     {
-        assert(castToType != TYP_FLOAT || comp->canUseEvexEncoding());
+        assert(castToType != TYP_FLOAT || comp->canUseEvexEncodingDebugOnly());
     }
 
 #if defined(TARGET_AMD64)
@@ -984,7 +967,7 @@ GenTree* Lowering::LowerCast(GenTree* tree)
         BlockRange().InsertAfter(newCast, newTree);
         LowerNode(newTree);
 
-        // usage 2 --> use thecompared mask with input value and max value to blend
+        // usage 2 --> use the compared mask with input value and max value to blend
         GenTree* control = comp->gtNewIconNode(0xCA); // (B & A) | (C & ~A)
         BlockRange().InsertAfter(newTree, control);
         GenTree* cndSelect = comp->gtNewSimdTernaryLogicNode(TYP_SIMD16, compMask, maxValDstType, newTree,
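The 0xCA immediate here encodes the blend as a ternary truth table: bit i of the control byte gives the output for inputs (A,B,C) taken from the bits of i, and 0xCA is exactly the bitwise select `A ? B : C`. A quick standalone sketch verifying that reading:

```cpp
#include <cstdint>
#include <cassert>

// Evaluate a vpternlog-style control byte bit by bit.
uint32_t TernaryLogic(uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
{
    uint32_t r = 0;
    for (int bit = 0; bit < 32; bit++)
    {
        int idx = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
        r |= (uint32_t)((imm >> idx) & 1) << bit;
    }
    return r;
}

int main()
{
    uint32_t a = 0xF0F0F0F0, b = 0x12345678, c = 0x9ABCDEF0;
    // 0xCA == (B & A) | (C & ~A): select B where mask A is set, else C.
    assert(TernaryLogic(a, b, c, 0xCA) == ((b & a) | (c & ~a)));
    return 0;
}
```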