Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unify unroll limits in a single entry point #83274

Merged
merged 8 commits into from
Mar 13, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3124,7 +3124,7 @@ void CodeGen::genLclHeap(GenTree* tree)

if (compiler->info.compInitMem)
{
if (amount <= LCLHEAP_UNROLL_LIMIT)
if (amount <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
{
// The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
// stp xzr, xzr, [sp, #-16]!
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/codegenloongarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2765,7 +2765,7 @@ void CodeGen::genCodeForDivMod(GenTreeOp* tree)
// Generate code for InitBlk by performing a loop unroll
// Preconditions:
// a) Both the size and fill byte value are integer constants.
// b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes.
// b) The size of the struct to initialize is smaller than getUnrollThreshold() bytes.
void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
{
assert(node->OperIs(GT_STORE_BLK));
Expand Down Expand Up @@ -6457,7 +6457,7 @@ void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode)
// None
//
// Assumption:
// The size argument of the CpBlk node is a constant and <= CPBLK_UNROLL_LIMIT bytes.
// The size argument of the CpBlk node is a constant and <= getUnrollThreshold() bytes.
//
void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
{
Expand Down
3 changes: 2 additions & 1 deletion src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3613,7 +3613,8 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
}

unsigned loadSize = putArgNode->GetArgLoadSize();
assert(!src->GetLayout(compiler)->HasGCPtr() && (loadSize <= CPBLK_UNROLL_LIMIT));
assert(!src->GetLayout(compiler)->HasGCPtr() &&
(loadSize <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memcpy)));

unsigned offset = 0;
regNumber xmmTmpReg = REG_NA;
Expand Down
47 changes: 47 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -8962,6 +8962,53 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#endif // FEATURE_SIMD

public:
// Selects which kind of block operation an unroll threshold is requested for.
enum UnrollKind
{
    // Initializing a block of memory with some value.
    Memset = 0,

    // Copying a block of memory from a source to a destination.
    Memcpy = 1
};

//------------------------------------------------------------------------
// getUnrollThreshold: Get the upper bound, in bytes, up to which a block
//    operation of the given kind is unrolled.
//
// Arguments:
//    type - the kind of block operation (UnrollKind::Memset or UnrollKind::Memcpy)
//
// Return Value:
//    The threshold in bytes. Callers compare a block size against it with "<=".
//
unsigned int getUnrollThreshold(UnrollKind type)
{
    // Without SIMD the widest single load/store is pointer-sized.
    unsigned threshold = TARGET_POINTER_SIZE;

#if defined(FEATURE_SIMD)
    threshold = maxSIMDStructBytes();
#if defined(TARGET_ARM64)
    // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
    //
    //   ldp q0, q1, [x1]
    //   stp q0, q1, [x0]
    //
    threshold *= 2;
#elif defined(TARGET_XARCH)
    // Ignore AVX-512 for now: cap the vector width at the YMM size so that wider
    // vectors do not raise the threshold. This must be a cap (min), not a floor (max);
    // a floor would bump SSE-only targets up to the YMM width and contradict the
    // table below.
    if (threshold > static_cast<unsigned>(YMM_REGSIZE_BYTES))
    {
        threshold = YMM_REGSIZE_BYTES;
    }
#endif
#endif

    if (type == UnrollKind::Memset)
    {
        // Typically, memset-like operations require fewer instructions than memcpy
        threshold *= 2;
    }

    // Use 4 as a multiplier by default, thus, the final threshold will be:
    //
    // | arch        | memset | memcpy |
    // |-------------|--------|--------|
    // | x86 avx512  |   256  |   128  | capped to the YMM width (AVX-512 ignored for now)
    // | x86 avx     |   256  |   128  |
    // | x86 sse     |   128  |    64  |
    // | arm64       |   256  |   128  | ldp/stp (2x128bit)
    // | arm         |    32  |    16  | no SIMD support
    // | loongarch64 |    64  |    32  | no SIMD support
    //
    // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
    //
    return threshold * 4;
}

//------------------------------------------------------------------------
// largestEnregisterableStruct: The size in bytes of the largest struct that can be enregistered.
//
Expand Down
26 changes: 3 additions & 23 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,18 +527,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->SetOper(GT_STORE_BLK);
}

unsigned initBlockUnrollLimit = INITBLK_UNROLL_LIMIT;

#ifdef TARGET_ARM64
if (isDstAddrLocal)
{
// Since dstAddr points to the stack CodeGen can use more optimal
// quad-word store SIMD instructions for InitBlock.
initBlockUnrollLimit = INITBLK_LCL_UNROLL_LIMIT;
}
#endif

if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= initBlockUnrollLimit) && src->OperIs(GT_CNS_INT))
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)) &&
src->OperIs(GT_CNS_INT))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down Expand Up @@ -608,17 +598,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
}
}

unsigned copyBlockUnrollLimit = CPBLK_UNROLL_LIMIT;

#ifdef TARGET_ARM64
if (isSrcAddrLocal && isDstAddrLocal)
{
// Since both srcAddr and dstAddr point to the stack CodeGen can use more optimal
// quad-word load and store SIMD instructions for CopyBlock.
copyBlockUnrollLimit = CPBLK_LCL_UNROLL_LIMIT;
}
#endif

unsigned copyBlockUnrollLimit = comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy);
if (blkNode->OperIs(GT_STORE_OBJ))
{
if (!blkNode->AsObj()->GetLayout()->HasGCPtr())
Expand Down
7 changes: 4 additions & 3 deletions src/coreclr/jit/lowerloongarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->SetOper(GT_STORE_BLK);
}

if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT) && src->OperIs(GT_CNS_INT))
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= getUnrollThreshold(UnrollKind::Memset)) &&
src->OperIs(GT_CNS_INT))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down Expand Up @@ -353,7 +354,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
{
blkNode->SetOper(GT_STORE_BLK);
}
else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
else if (dstAddr->OperIsLocalAddr() && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
{
// If the size is small enough to unroll then we need to mark the block as non-interruptible
// to actually allow unrolling. The generated code does not report GC references loaded in the
Expand All @@ -371,7 +372,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down
8 changes: 4 additions & 4 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->SetOper(GT_STORE_BLK);
}

if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT))
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)))
{
if (!src->OperIs(GT_CNS_INT))
{
Expand Down Expand Up @@ -412,7 +412,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->SetOper(GT_STORE_BLK);
}
#ifndef JIT32_GCENCODER
else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
else if (dstAddr->OperIsLocalAddr() && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy)))
{
// If the size is small enough to unroll then we need to mark the block as non-interruptible
// to actually allow unrolling. The generated code does not report GC references loaded in the
Expand Down Expand Up @@ -472,7 +472,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
}
}
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy)))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down Expand Up @@ -655,7 +655,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
}
else
#endif // TARGET_X86
if (loadSize <= CPBLK_UNROLL_LIMIT)
if (loadSize <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy))
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
}
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/lsraarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,7 @@ int LinearScan::BuildNode(GenTree* tree)
// localloc.
sizeVal = AlignUp(sizeVal, STACK_ALIGN);

if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
if (sizeVal <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
{
// Need no internal registers
}
Expand Down
2 changes: 0 additions & 2 deletions src/coreclr/jit/targetamd64.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0

#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
#define INITBLK_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk.
#define CPOBJ_NONGC_SLOTS_LIMIT 4 // For CpObj code generation, this is the threshold of the number
// of contiguous non-gc slots that trigger generating rep movsq instead of
// sequences of movsq instructions
Expand Down
3 changes: 0 additions & 3 deletions src/coreclr/jit/targetarm.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0

#define CPBLK_UNROLL_LIMIT 32 // Upper bound to let the code generator to loop unroll CpBlk.
#define INITBLK_UNROLL_LIMIT 16 // Upper bound to let the code generator to loop unroll InitBlk.

#define FEATURE_FIXED_OUT_ARGS 1 // Preallocate the outgoing arg area in the prolog
#define FEATURE_STRUCTPROMOTE 1 // JIT Optimization to promote fields of structs into registers
#define FEATURE_MULTIREG_STRUCT_PROMOTE 0 // True when we want to promote fields of a multireg struct into registers
Expand Down
6 changes: 0 additions & 6 deletions src/coreclr/jit/targetarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0

#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk
#define CPBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack)
#define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk
#define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack)
#define LCLHEAP_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required)

#ifdef FEATURE_SIMD
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
#define FEATURE_PARTIAL_SIMD_CALLEE_SAVE 1 // Whether SIMD registers are partially saved at calls
Expand Down
3 changes: 0 additions & 3 deletions src/coreclr/jit/targetloongarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,6 @@
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0

#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
#define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk.

#ifdef FEATURE_SIMD
#pragma error("SIMD Unimplemented yet LOONGARCH")
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
Expand Down
2 changes: 0 additions & 2 deletions src/coreclr/jit/targetx86.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@

// TODO-CQ: Fine tune the following xxBlk threshold values:

#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
#define INITBLK_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk.
#define CPOBJ_NONGC_SLOTS_LIMIT 4 // For CpObj code generation, this is the threshold of the number
// of contiguous non-gc slots that trigger generating rep movsq instead of
// sequences of movsq instructions
Expand Down