
Adding the 2-parameter xplat shuffle helpers and accelerating them #68559

Merged · 19 commits · May 2, 2022
Changes from 10 commits
Commits
dec875a
Adding managed definitions for cross-platform shuffle helpers
tannergooding Feb 28, 2022
f6b659f
Adding basic tests covering the Vector64/128/256 Shuffle APIs
tannergooding Mar 14, 2022
39d88ac
Adding JIT support to recognize Vector64/128/256.Shuffle as intrinsic…
tannergooding Apr 16, 2022
b55a154
Adding a helper for determining if a node represents a vector constant
tannergooding Apr 16, 2022
155d9de
Adding x86/x64 acceleration for the 2-parameter xplat shuffle helpers
tannergooding Apr 16, 2022
c9c05a4
Adding Arm64 acceleration for the 2-parameter xplat shuffle helpers
tannergooding Apr 26, 2022
b1a38ac
Ensure a switch covers the "default" case
tannergooding Apr 26, 2022
0f97a0a
Applying formatting patch
tannergooding Apr 26, 2022
903229a
Ensure the call to Op uses 1-based indexing
tannergooding Apr 26, 2022
3e635bd
Ensure TYP_LONG and TYP_ULONG fixup simdBaseJitType and simdBaseType
tannergooding Apr 27, 2022
0ecb386
Have gtNewSimdShuffle use fgMakeMultiUse
tannergooding Apr 28, 2022
54c5da7
Don't pass an unnecessary compiler instance to `gtNewSimdShuffleNode`
tannergooding Apr 28, 2022
039f7bb
Don't expose the unused gtNewSimdShuffleNode API
tannergooding Apr 29, 2022
3c41d97
Allow fgMakeMultiUse to take a structType and pass it down to fgInser…
tannergooding Apr 29, 2022
9b5407e
Pass down the clsHnd to fgMakeMultiUse from gtNewSimdShuffleNode
tannergooding Apr 29, 2022
e1b3534
Adding some additional tests covering the vector shuffle APIs
tannergooding Apr 29, 2022
a78e01f
Ensure the Vector256 test is accessing the right index
tannergooding Apr 30, 2022
675e9ee
Ensure we look up the correct clsHnd
tannergooding Apr 30, 2022
88181dd
Applying formatting patch
tannergooding May 1, 2022
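
For orientation, the following is a minimal scalar sketch (added here for exposition, not part of the PR) of the behavior the new 2-parameter Shuffle helpers provide: each destination element takes the source element named by the corresponding index, and any index outside [0, Count) yields zero. The byte element type and the function name are illustrative assumptions.

#include <cstddef>
#include <cstdint>

// Scalar model of Vector128<byte>.Shuffle(values, indices):
// result[i] = (indices[i] < count) ? values[indices[i]] : 0
static void ShuffleScalarModel(const uint8_t* values, const uint8_t* indices, uint8_t* result, size_t count)
{
    for (size_t i = 0; i < count; i++)
    {
        result[i] = (indices[i] < count) ? values[indices[i]] : 0;
    }
}
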
15 changes: 15 additions & 0 deletions src/coreclr/jit/compiler.h
@@ -2522,6 +2522,21 @@ class Compiler
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTree* gtNewSimdShuffleNode(var_types type,
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTree* gtNewSimdShuffleNode(var_types type,
GenTree* op1,
GenTree* op2,
GenTree* op3,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTree* gtNewSimdSqrtNode(
var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic);

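For context, an importer call site (not shown in this diff) would construct the node through the first overload declared above, roughly as in the sketch below; the variable names and the surrounding checks are assumptions.

// Hypothetical call-site sketch: op2 must already be a vector constant holding the indices.
GenTree* retNode = gtNewSimdShuffleNode(retType, op1, op2, simdBaseJitType, simdSize,
                                        /* isSimdAsHWIntrinsic */ false);
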
344 changes: 344 additions & 0 deletions src/coreclr/jit/gentree.cpp
@@ -21416,6 +21416,350 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type,
#endif // !TARGET_XARCH && !TARGET_ARM64
}

GenTree* Compiler::gtNewSimdShuffleNode(var_types type,
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isSimdAsHWIntrinsic)
{
assert(IsBaselineSimdIsaSupportedDebugOnly());

assert(varTypeIsSIMD(type));
assert(getSIMDTypeForSize(simdSize) == type);

assert(op1 != nullptr);
assert(op1->TypeIs(type));

assert(op2 != nullptr);
assert(op2->TypeIs(type));
assert(op2->IsVectorConst());

var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
assert(varTypeIsArithmetic(simdBaseType));

if (op2->IsVectorAllBitsSet())
{
// AllBitsSet represents indices that are always "out of range" which means zero should be
// selected for every element. We can special-case this down to just returning a zero node
return gtNewSimdZeroNode(type, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false);
}

if (op2->IsVectorZero())
{
// TODO-XARCH-CQ: Zero represents indices that select the first element of op1 each time. We can simplify
// this down to basically a broadcast equivalent.
}

GenTree* retNode = nullptr;
GenTreeIntConCommon* cnsNode = nullptr;

size_t elementSize = genTypeSize(simdBaseType);
size_t elementCount = simdSize / elementSize;

#if defined(TARGET_XARCH)
uint8_t control = 0;
bool crossLane = false;
bool needsZero = varTypeIsSmallInt(simdBaseType);
uint64_t value = 0;
uint8_t vecCns[32] = {};
uint8_t mskCns[32] = {};

for (size_t index = 0; index < elementCount; index++)
{
value = op2->GetIntegralVectorConstElement(index);

if (value < elementCount)
{
if (simdSize == 32)
{
// Most of the 256-bit shuffle/permute instructions operate as if
// the inputs were 2x 128-bit values. If the selected indices cross
// the respective 128-bit "lane" we may need to specialize the codegen

if (index < (elementCount / 2))
{
crossLane |= (value >= (elementCount / 2));
}
else
{
crossLane |= (value < (elementCount / 2));
}
}
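// For illustration: with 8x 32-bit elements, the lanes are elements 0-3 and 4-7, so
// destination slot 1 selecting source element 6 crosses lanes, while slot 1 selecting
// source element 2 does not.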

// Setting the control word for byte/sbyte and short/ushort is unnecessary
// and would actually compute an incorrect control word, but doing it
// unconditionally simplifies the overall logic here and the value simply goes unused.

control |= (value << (index * (elementCount / 2)));

// When Ssse3 is supported, we may need vecCns to accurately select the relevant
// bytes if some index is outside the valid range. Since x86/x64 is little-endian
// we can simplify this down to a for loop that scales the value and selects count
// sequential bytes.

for (uint32_t i = 0; i < elementSize; i++)
{
vecCns[(index * elementSize) + i] = (uint8_t)((value * elementSize) + i);

// When Ssse3 is not supported, we need to adjust the constant to be AllBitsSet
// so that we can emit a ConditionalSelect(op2, retNode, zeroNode).

mskCns[(index * elementSize) + i] = 0xFF;
}
}
else
{
needsZero = true;

// When Ssse3 is supported, we may need vecCns to accurately select the relevant
// bytes if some index is outside the valid range. We can do this by just zeroing
// out each byte in the element. This only requires the most significant bit to be
// set, but we use 0xFF instead since that will be the equivalent of AllBitsSet

for (uint32_t i = 0; i < elementSize; i++)
{
vecCns[(index * elementSize) + i] = 0xFF;

// When Ssse3 is not supported, we need to adjust the constant to be Zero
// so that we can emit a ConditionalSelect(op2, retNode, zeroNode).

mskCns[(index * elementSize) + i] = 0x00;
}
}
}
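// For illustration: given 4x 32-bit elements with indices { 3, 0, 1, 7 }, the loop above
// produces (first 16 bytes)
//   vecCns  = { 12,13,14,15,  0,1,2,3,  4,5,6,7,  0xFF,0xFF,0xFF,0xFF }
//   mskCns  = { 0xFF x 12, 0x00 x 4 }
//   control = (3 << 0) | (0 << 2) | (1 << 4) = 0b00010011
// Index 7 is out of range, so that element selects zero and needsZero is set.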

if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));

if (varTypeIsSmallInt(simdBaseType))
{
if (crossLane)
{
// TODO-XARCH-CQ: We should emulate cross-lane shuffling for byte/sbyte and short/ushort
unreached();
}

// If we aren't crossing lanes, then we can decompose the byte/sbyte
// and short/ushort operations into 2x 128-bit operations

// We want to build what is essentially the following managed code:
// var op1Lower = op1.GetLower();
// op1Lower = Ssse3.Shuffle(op1Lower, Vector128.Create(...));
//
// var op1Upper = op1.GetUpper();
// op1Upper = Ssse3.Shuffle(op1Upper, Vector128.Create(...));
//
// return Vector256.Create(op1Lower, op1Upper);

simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;

CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, simdBaseJitType);

GenTree* op1Dup;
op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL,
nullptr DEBUGARG("Clone op1 for vector shuffle"));

GenTree* op1Lower = gtNewSimdHWIntrinsicNode(type, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize);

IntrinsicNodeBuilder nodeBuilder1(getAllocator(CMK_ASTNode), 16);

for (uint32_t i = 0; i < 16; i++)
{
nodeBuilder1.AddOperand(i, gtNewIconNode(vecCns[i]));
}

op2 = gtNewSimdHWIntrinsicNode(type, std::move(nodeBuilder1), NI_Vector128_Create, simdBaseJitType, 16);

op1Lower = gtNewSimdHWIntrinsicNode(type, op1Lower, op2, NI_SSSE3_Shuffle, simdBaseJitType, 16,
isSimdAsHWIntrinsic);

GenTree* op1Upper = gtNewSimdHWIntrinsicNode(type, op1Dup, gtNewIconNode(1), NI_AVX_ExtractVector128,
simdBaseJitType, simdSize);

IntrinsicNodeBuilder nodeBuilder2(getAllocator(CMK_ASTNode), 16);

for (uint32_t i = 0; i < 16; i++)
{
nodeBuilder2.AddOperand(i, gtNewIconNode(vecCns[16 + i]));
}

op2 = gtNewSimdHWIntrinsicNode(type, std::move(nodeBuilder2), NI_Vector128_Create, simdBaseJitType, 16);

op1Upper = gtNewSimdHWIntrinsicNode(type, op1Upper, op2, NI_SSSE3_Shuffle, simdBaseJitType, 16,
isSimdAsHWIntrinsic);

return gtNewSimdHWIntrinsicNode(type, op1Lower, op1Upper, gtNewIconNode(1), NI_AVX_InsertVector128,
simdBaseJitType, simdSize);
}

if (elementSize == 4)
{
IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), elementCount);

for (uint32_t i = 0; i < elementCount; i++)
{
uint8_t value = (uint8_t)(vecCns[i * elementSize] / elementSize);
nodeBuilder.AddOperand(i, gtNewIconNode(value));
}

CorInfoType indicesJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UINT : CORINFO_TYPE_INT;

op2 = gtNewSimdHWIntrinsicNode(type, std::move(nodeBuilder), NI_Vector256_Create, indicesJitType, simdSize);

// swap the operands to match the encoding requirements
retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX2_PermuteVar8x32, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}
else
{
assert(elementSize == 8);

cnsNode = gtNewIconNode(control);
retNode = gtNewSimdHWIntrinsicNode(type, op1, cnsNode, NI_AVX2_Permute4x64, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}
}
else
{
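// Note: pshufb (NI_SSSE3_Shuffle) selects one source byte per destination byte and zeroes
// any destination byte whose control byte has its most significant bit set, which is why
// out-of-range elements were encoded as 0xFF in vecCns above.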
if (needsZero && compOpportunisticallyDependsOn(InstructionSet_SSSE3))
{
simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;

IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), simdSize);

for (uint32_t i = 0; i < simdSize; i++)
{
nodeBuilder.AddOperand(i, gtNewIconNode(vecCns[i]));
}

op2 =
gtNewSimdHWIntrinsicNode(type, std::move(nodeBuilder), NI_Vector128_Create, simdBaseJitType, simdSize);

return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSSE3_Shuffle, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}

if (varTypeIsLong(simdBaseType))
{
// TYP_LONG and TYP_ULONG don't have their own shuffle/permute instructions, so we'll
// just utilize the path for TYP_DOUBLE for simplicity. We could alternatively break this
// down into a TYP_INT or TYP_UINT based shuffle, but that's additional complexity for no
// real benefit since the shuffle executes on its own port rather than on the fp-specific ports.

simdBaseJitType = CORINFO_TYPE_DOUBLE;
simdBaseType = TYP_DOUBLE;
}

cnsNode = gtNewIconNode(control);

if (varTypeIsIntegral(simdBaseType))
{
retNode = gtNewSimdHWIntrinsicNode(type, op1, cnsNode, NI_SSE2_Shuffle, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX))
{
retNode = gtNewSimdHWIntrinsicNode(type, op1, cnsNode, NI_AVX_Permute, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}
else
{
CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, simdBaseJitType);

GenTree* op1Dup;
op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL,
nullptr DEBUGARG("Clone op1 for vector shuffle"));

retNode = gtNewSimdHWIntrinsicNode(type, op1, op1Dup, cnsNode, NI_SSE_Shuffle, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}
}

assert(retNode != nullptr);
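// Note: without Ssse3 the shuffle emitted above cannot zero the out-of-range elements
// itself, so blend the shuffled result against zero using the mask built earlier:
//   retNode = ConditionalSelect(mskCns, retNode, Zero) == (retNode & mskCns) | (Zero & ~mskCns)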

if (needsZero)
{
assert(!compIsaSupportedDebugOnly(InstructionSet_SSSE3));

IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), simdSize);

for (uint32_t i = 0; i < simdSize; i++)
{
nodeBuilder.AddOperand(i, gtNewIconNode(mskCns[i]));
}

op2 = gtNewSimdHWIntrinsicNode(type, std::move(nodeBuilder), NI_Vector128_Create, simdBaseJitType, simdSize);

GenTree* zero = gtNewSimdZeroNode(type, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
retNode = gtNewSimdCndSelNode(type, op2, retNode, zero, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
}

return retNode;
#elif defined(TARGET_ARM64)
uint64_t value = 0;
uint8_t vecCns[16] = {};

for (size_t index = 0; index < elementCount; index++)
{
value = op2->GetIntegralVectorConstElement(index);

if (value < elementCount)
{
for (uint32_t i = 0; i < elementSize; i++)
{
vecCns[(index * elementSize) + i] = (uint8_t)((value * elementSize) + i);
}
}
else
{
for (uint32_t i = 0; i < elementSize; i++)
{
vecCns[(index * elementSize) + i] = 0xFF;
}
}
}
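// Note: on Arm64 the shuffle maps onto a byte table lookup (TBL). vecCns holds one source
// byte index per destination byte, and TBL writes zero for any index outside the table, so
// the 0xFF entries above provide the "out of range selects zero" behavior directly.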

NamedIntrinsic createIntrinsic = NI_Vector64_Create;
NamedIntrinsic lookupIntrinsic = NI_AdvSimd_VectorTableLookup;

if (simdSize == 16)
{
createIntrinsic = NI_Vector128_Create;
lookupIntrinsic = NI_AdvSimd_Arm64_VectorTableLookup;

op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector64_ToVector128, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}

IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), simdSize);

for (uint32_t i = 0; i < simdSize; i++)
{
nodeBuilder.AddOperand(i, gtNewIconNode(vecCns[i]));
}

op2 = gtNewSimdHWIntrinsicNode(type, std::move(nodeBuilder), createIntrinsic, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);

return gtNewSimdHWIntrinsicNode(type, op1, op2, lookupIntrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
#else
#error Unsupported platform
#endif // !TARGET_XARCH && !TARGET_ARM64
}

GenTree* Compiler::gtNewSimdShuffleNode(var_types type,
GenTree* op1,
GenTree* op2,
GenTree* op3,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isSimdAsHWIntrinsic)
{
return nullptr;
}

GenTree* Compiler::gtNewSimdSqrtNode(
var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic)
{