Commit

Adding the 2-parameter xplat shuffle helpers and accelerating them (#68559)

* Adding managed definitions for cross-platform shuffle helpers

* Adding basic tests covering the Vector64/128/256 Shuffle APIs

* Adding JIT support to recognize Vector64/128/256.Shuffle as an intrinsic but not yet handle it as one

* Adding a helper for determining if a node represents a vector constant

* Adding x86/x64 acceleration for the 2-parameter xplat shuffle helpers

* Adding Arm64 acceleration for the 2-parameter xplat shuffle helpers

* Ensure a switch covers the "default" case

* Applying formatting patch

* Ensure the call to Op uses 1-based indexing

* Ensure TYP_LONG and TYP_ULONG fixup simdBaseJitType and simdBaseType

* Have gtNewSimdShuffle use fgMakeMultiUse

* Don't pass an unnecessary compiler instance to `gtNewSimdShuffleNode`

* Don't expose the unused gtNewSimdShuffleNode API

* Allow fgMakeMultiUse to take a structType and pass it down to fgInsertCommaFormTemp

* Pass down the clsHnd to fgMakeMultiUse from gtNewSimdShuffleNode

* Adding some additional tests covering the vector shuffle APIs

* Ensure the Vector256 test is accessing the right index

* Ensure we look up the correct clsHnd

* Applying formatting patch
tannergooding authored May 2, 2022
1 parent 809dd7c commit 23283ff
Showing 15 changed files with 3,125 additions and 8 deletions.
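
For orientation, a minimal sketch of the semantics the new helpers accelerate (illustrative C++ with hypothetical names; the shipped surface is the managed Vector64/128/256.Shuffle APIs). Each output element selects values[indices[i]], and any index outside [0, Count) yields zero; comparing the index as unsigned makes negative values fall out of range, mirroring the "value < elementCount" checks in the JIT helper below:

    #include <cstddef>
    #include <cstdint>

    // Scalar reference model of the two-parameter Shuffle helpers.
    template <typename T, typename TIndex, std::size_t Count>
    void ShuffleScalar(const T (&values)[Count], const TIndex (&indices)[Count], T (&result)[Count])
    {
        for (std::size_t i = 0; i < Count; i++)
        {
            std::uint64_t idx = static_cast<std::uint64_t>(indices[i]);
            result[i] = (idx < Count) ? values[idx] : T{};
        }
    }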
30 changes: 29 additions & 1 deletion src/coreclr/jit/compiler.h
@@ -2522,6 +2522,13 @@ class Compiler
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTree* gtNewSimdShuffleNode(var_types type,
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTree* gtNewSimdSqrtNode(
var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic);

@@ -5495,7 +5502,7 @@ class Compiler
// Create a new temporary variable to hold the result of *ppTree,
// and transform the graph accordingly.
GenTree* fgInsertCommaFormTemp(GenTree** ppTree, CORINFO_CLASS_HANDLE structType = nullptr);
GenTree* fgMakeMultiUse(GenTree** ppTree);
GenTree* fgMakeMultiUse(GenTree** ppTree, CORINFO_CLASS_HANDLE structType = nullptr);
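// fgMakeMultiUse replaces the single-parameter form above; the optional
// structType is forwarded to fgInsertCommaFormTemp so that SIMD-typed temps
// are created with the correct class handle.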

private:
// Recognize a bitwise rotation pattern and convert into a GT_ROL or a GT_ROR node.
@@ -8261,6 +8268,27 @@
return NO_CLASS_HANDLE;
}

#if defined(FEATURE_HW_INTRINSICS)
CORINFO_CLASS_HANDLE gtGetStructHandleForSimdOrHW(var_types simdType,
CorInfoType simdBaseJitType,
bool isSimdAsHWIntrinsic = false)
{
CORINFO_CLASS_HANDLE clsHnd = NO_CLASS_HANDLE;

if (isSimdAsHWIntrinsic)
{
clsHnd = gtGetStructHandleForSIMD(simdType, simdBaseJitType);
}
else
{
clsHnd = gtGetStructHandleForHWSIMD(simdType, simdBaseJitType);
}

assert(clsHnd != NO_CLASS_HANDLE);
return clsHnd;
}
#endif // FEATURE_HW_INTRINSICS

// Returns true if this is a SIMD type that should be considered an opaque
// vector type (i.e. do not analyze or promote its fields).
// Note that all but the fixed vector types are opaque, even though they may
332 changes: 332 additions & 0 deletions src/coreclr/jit/gentree.cpp
@@ -21335,6 +21335,338 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type,
#endif // !TARGET_XARCH && !TARGET_ARM64
}

GenTree* Compiler::gtNewSimdShuffleNode(var_types type,
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isSimdAsHWIntrinsic)
{
assert(IsBaselineSimdIsaSupportedDebugOnly());

assert(varTypeIsSIMD(type));
assert(getSIMDTypeForSize(simdSize) == type);

assert(op1 != nullptr);
assert(op1->TypeIs(type));

assert(op2 != nullptr);
assert(op2->TypeIs(type));
assert(op2->IsVectorConst());

var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
assert(varTypeIsArithmetic(simdBaseType));

if (op2->IsVectorAllBitsSet())
{
// AllBitsSet represents indices that are always "out of range" which means zero should be
// selected for every element. We can special-case this down to just returning a zero node
return gtNewSimdZeroNode(type, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false);
}

if (op2->IsVectorZero())
{
// TODO-XARCH-CQ: Zero represents indices that select the first element of op1 each time. We can simplify
// this down to basically a broadcast equivalent.
}

GenTree* retNode = nullptr;
GenTreeIntConCommon* cnsNode = nullptr;

size_t elementSize = genTypeSize(simdBaseType);
size_t elementCount = simdSize / elementSize;

#if defined(TARGET_XARCH)
uint8_t control = 0;
bool crossLane = false;
bool needsZero = varTypeIsSmallInt(simdBaseType);
uint64_t value = 0;
uint8_t vecCns[32] = {};
uint8_t mskCns[32] = {};

for (size_t index = 0; index < elementCount; index++)
{
value = op2->GetIntegralVectorConstElement(index);

if (value < elementCount)
{
if (simdSize == 32)
{
// Most of the 256-bit shuffle/permute instructions operate as if
// the inputs were 2x 128-bit values. If the selected indices cross
// the respective 128-bit "lane" we may need to specialize the codegen

if (index < (elementCount / 2))
{
crossLane |= (value >= (elementCount / 2));
}
else
{
crossLane |= (value < (elementCount / 2));
}
}

// Computing the control word for byte/sbyte and short/ushort is
// unnecessary and actually produces an incorrect value, but doing it
// unconditionally simplifies the logic here; the result simply goes unused.

control |= (value << (index * (elementCount / 2)));

// When Ssse3 is supported, we may need vecCns to accurately select the relevant
// bytes if any index is outside the valid range. Since x86/x64 is little-endian
// we can simplify this down to a for loop that scales the value and selects
// elementSize sequential bytes.

for (uint32_t i = 0; i < elementSize; i++)
{
vecCns[(index * elementSize) + i] = (uint8_t)((value * elementSize) + i);

// When Ssse3 is not supported, we need to adjust the constant to be AllBitsSet
// so that we can emit a ConditionalSelect(op2, retNode, zeroNode).

mskCns[(index * elementSize) + i] = 0xFF;
}
}
else
{
needsZero = true;

// When Ssse3 is supported, we may need vecCns to accurately select the relevant
// bytes if some index is outside the valid range. We can do this by just zeroing
// out each byte in the element. This only requires the most significant bit to be
// set, but we use 0xFF instead since that will be the equivalent of AllBitsSet

for (uint32_t i = 0; i < elementSize; i++)
{
vecCns[(index * elementSize) + i] = 0xFF;

// When Ssse3 is not supported, we need to adjust the constant to be Zero
// so that we can emit a ConditionalSelect(op2, retNode, zeroNode).

mskCns[(index * elementSize) + i] = 0x00;
}
}
}
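// As a concrete illustration: reversing a Vector128<int> with indices
// (3, 2, 1, 0) produces control = 0b00011011 (0x1B, the familiar PSHUFD
// immediate for a reverse), vecCns = { 12,13,14,15, 8,9,10,11, 4,5,6,7,
// 0,1,2,3 }, and mskCns filled with 0xFF since every index is in range.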

if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));

if (varTypeIsSmallInt(simdBaseType))
{
if (crossLane)
{
// TODO-XARCH-CQ: We should emulate cross-lane shuffling for byte/sbyte and short/ushort
unreached();
}

// If we aren't crossing lanes, then we can decompose the byte/sbyte
// and short/ushort operations into 2x 128-bit operations

CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSimdOrHW(type, simdBaseJitType, isSimdAsHWIntrinsic);

// We want to build what is essentially the following managed code:
// var op1Lower = op1.GetLower();
// op1Lower = Ssse3.Shuffle(op1Lower, Vector128.Create(...));
//
// var op1Upper = op1.GetUpper();
// op1Upper = Ssse3.Shuffle(op1Upper, Vector128.Create(...));
//
// return Vector256.Create(op1Lower, op1Upper);

simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;

GenTree* op1Dup = fgMakeMultiUse(&op1, clsHnd);
GenTree* op1Lower = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);

IntrinsicNodeBuilder nodeBuilder1(getAllocator(CMK_ASTNode), 16);

for (uint32_t i = 0; i < 16; i++)
{
nodeBuilder1.AddOperand(i, gtNewIconNode(vecCns[i]));
}

op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, std::move(nodeBuilder1), NI_Vector128_Create, simdBaseJitType, 16,
isSimdAsHWIntrinsic);

op1Lower = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Lower, op2, NI_SSSE3_Shuffle, simdBaseJitType, 16,
isSimdAsHWIntrinsic);

// Widen the shuffled lower half back to a 256-bit node so that it can
// feed NI_AVX_InsertVector128 below.
op1Lower = gtNewSimdHWIntrinsicNode(TYP_SIMD32, op1Lower, NI_Vector128_ToVector256Unsafe, simdBaseJitType, 16,
isSimdAsHWIntrinsic);

GenTree* op1Upper = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Dup, gtNewIconNode(1), NI_AVX_ExtractVector128,
simdBaseJitType, simdSize, isSimdAsHWIntrinsic);

IntrinsicNodeBuilder nodeBuilder2(getAllocator(CMK_ASTNode), 16);

for (uint32_t i = 0; i < 16; i++)
{
nodeBuilder2.AddOperand(i, gtNewIconNode(vecCns[16 + i]));
}

op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, std::move(nodeBuilder2), NI_Vector128_Create, simdBaseJitType, 16,
isSimdAsHWIntrinsic);

op1Upper = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Upper, op2, NI_SSSE3_Shuffle, simdBaseJitType, 16,
isSimdAsHWIntrinsic);

return gtNewSimdHWIntrinsicNode(type, op1Lower, op1Upper, gtNewIconNode(1), NI_AVX_InsertVector128,
simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
}

if (elementSize == 4)
{
IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), elementCount);

for (uint32_t i = 0; i < elementCount; i++)
{
uint8_t value = (uint8_t)(vecCns[i * elementSize] / elementSize);
nodeBuilder.AddOperand(i, gtNewIconNode(value));
}

CorInfoType indicesJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UINT : CORINFO_TYPE_INT;

op2 = gtNewSimdHWIntrinsicNode(type, std::move(nodeBuilder), NI_Vector256_Create, indicesJitType, simdSize,
isSimdAsHWIntrinsic);

// swap the operands to match the encoding requirements
retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX2_PermuteVar8x32, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}
else
{
assert(elementSize == 8);

cnsNode = gtNewIconNode(control);
retNode = gtNewSimdHWIntrinsicNode(type, op1, cnsNode, NI_AVX2_Permute4x64, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}
}
else
{
if (needsZero && compOpportunisticallyDependsOn(InstructionSet_SSSE3))
{
simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;

IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), simdSize);

for (uint32_t i = 0; i < simdSize; i++)
{
nodeBuilder.AddOperand(i, gtNewIconNode(vecCns[i]));
}

op2 = gtNewSimdHWIntrinsicNode(type, std::move(nodeBuilder), NI_Vector128_Create, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);

return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSSE3_Shuffle, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}

if (varTypeIsLong(simdBaseType))
{
// TYP_LONG and TYP_ULONG don't have their own shuffle/permute instructions and so we'll
// just utilize the path for TYP_DOUBLE for simplicity. We could alternatively break this
// down into a TYP_INT or TYP_UINT based shuffle, but that's additional complexity for no
// real benefit since shuffle gets its own port rather than using the fp specific ports.

simdBaseJitType = CORINFO_TYPE_DOUBLE;
simdBaseType = TYP_DOUBLE;
}
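// For example, swapping a Vector128<long> with indices (1, 0) computed
// control = 0b01 in the loop above and, after this remap, flows down the
// TYP_DOUBLE shuffle/permute path below.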

cnsNode = gtNewIconNode(control);

if (varTypeIsIntegral(simdBaseType))
{
retNode = gtNewSimdHWIntrinsicNode(type, op1, cnsNode, NI_SSE2_Shuffle, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX))
{
retNode = gtNewSimdHWIntrinsicNode(type, op1, cnsNode, NI_AVX_Permute, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}
else
{
CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSimdOrHW(type, simdBaseJitType, isSimdAsHWIntrinsic);

GenTree* op1Dup = fgMakeMultiUse(&op1, clsHnd);
retNode = gtNewSimdHWIntrinsicNode(type, op1, op1Dup, cnsNode, NI_SSE_Shuffle, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}
}

assert(retNode != nullptr);

if (needsZero)
{
assert(!compIsaSupportedDebugOnly(InstructionSet_SSSE3));
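// Emulate the zeroing PSHUFB would have provided: select the shuffled
// bytes where mskCns is 0xFF (the index was in range) and zero where it
// is 0x00.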

IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), simdSize);

for (uint32_t i = 0; i < simdSize; i++)
{
nodeBuilder.AddOperand(i, gtNewIconNode(mskCns[i]));
}

op2 = gtNewSimdHWIntrinsicNode(type, std::move(nodeBuilder), NI_Vector128_Create, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);

GenTree* zero = gtNewSimdZeroNode(type, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
retNode = gtNewSimdCndSelNode(type, op2, retNode, zero, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
}

return retNode;
#elif defined(TARGET_ARM64)
uint64_t value = 0;
uint8_t vecCns[16] = {};

for (size_t index = 0; index < elementCount; index++)
{
value = op2->GetIntegralVectorConstElement(index);

if (value < elementCount)
{
for (uint32_t i = 0; i < elementSize; i++)
{
vecCns[(index * elementSize) + i] = (uint8_t)((value * elementSize) + i);
}
}
else
{
for (uint32_t i = 0; i < elementSize; i++)
{
vecCns[(index * elementSize) + i] = 0xFF;
}
}
}
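// AdvSimd's TBL instruction writes zero for any byte index beyond the end
// of the table, so the 0xFF entries written above naturally select zero
// for out-of-range elements.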

NamedIntrinsic createIntrinsic = NI_Vector64_Create;
NamedIntrinsic lookupIntrinsic = NI_AdvSimd_VectorTableLookup;

if (simdSize == 16)
{
createIntrinsic = NI_Vector128_Create;
lookupIntrinsic = NI_AdvSimd_Arm64_VectorTableLookup;
}
else
{
assert(simdSize == 8);

// AdvSimd.VectorTableLookup consumes a 128-bit table, so the 64-bit
// source vector must be widened before the lookup.
op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector64_ToVector128, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);
}

// The table lookup selects individual bytes, so the control vector is
// built with a byte base type.
simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;

IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), simdSize);

for (uint32_t i = 0; i < simdSize; i++)
{
nodeBuilder.AddOperand(i, gtNewIconNode(vecCns[i]));
}

op2 = gtNewSimdHWIntrinsicNode(type, std::move(nodeBuilder), createIntrinsic, simdBaseJitType, simdSize,
isSimdAsHWIntrinsic);

return gtNewSimdHWIntrinsicNode(type, op1, op2, lookupIntrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
#else
#error Unsupported platform
#endif // !TARGET_XARCH && !TARGET_ARM64
}
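// Illustrative, hypothetical call site: once the importer has verified that
// the shuffle indices (op2) form a vector constant, it can route a recognized
// Vector64/128/256.Shuffle call here, e.g.:
//
//     retNode = gtNewSimdShuffleNode(retType, op1, op2, simdBaseJitType,
//                                    simdSize, /* isSimdAsHWIntrinsic */ false);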

GenTree* Compiler::gtNewSimdSqrtNode(
var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic)
{