From f7ff1244fdcd8c795ea50abf8b89f4ef7adfe366 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee
Date: Sat, 14 May 2016 13:06:57 -0700
Subject: [PATCH] ARM64: Enable End-To-End ReadyToRun (R2R) Crossgen

Fixes https://github.com/dotnet/coreclr/issues/4649

The immediate issue was an NYI in genEmitHelperCall. The initial
implementation of the missing part was enough to crossgen System.dll, but
running the tests revealed various issues in the crossgened (R2R) binaries.

Most common user/helper calls in R2R are represented as indirect calls,
similar to interface calls dispatched through a virtual stub dispatch cell
-- the thunk/helper needs the indirection cell address so that it can write
the final target address back to that data location. `IsDelayLoadHelper`
and `IsLazyHelper` belong to this case. Instead of passing such a parameter
explicitly, x64/x86 uses an encoding trick -- it assumes the call is
dispatched as `call [addr]`, so the runtime can extract the indirection
cell address from the return address (a short sketch of this trick is
appended after the diff). Unfortunately this is not an option on arm64
(and actually arm as well, which I haven't fixed in this change), where an
indirect call through memory is not encodable.

So, I made the following changes:

1. For calls that need to pass the indirection cell address, I tagged the
   call tree via `setR2RRelativeIndir`. I tried to be comprehensive, but I
   may have missed something. Currently this includes regular calls and
   various helpers for (virtual) function pointer loads, static data
   access, etc. Hopefully we can eventually change the JIT/EE interface to
   give us this information explicitly.
2. Use x11 to hold the indirection cell address for such call trees in
   lowering, similar to VSD.
3. Fixed the instruction encodings in `ZapIndirectHelperThunk`. In
   particular, the immediate offset of the PC-relative `ldr` must be scaled
   down by 4, since the hardware scales it back up by 4 (see the `ldr`
   encoding sketch appended after the diff).
4. Implemented `genEmitHelperCall` for the indirect case. This is not a
   case that needs the indirection cell address; it is the case where we
   inline the indirect helper thunk for speed. When optimizing for size we
   instead emit a direct call to such a thunk, which uses x12 to dispatch
   to the final target. Likewise, I used x12 for this inline expansion,
   since it appears to be a trash register that does not overlap the
   arguments of JIT helpers such as write barriers.

With this change, I've tested various cases/scenarios locally. I've also
verified that all tests pass against mscorlib.ni.dll and System.ni.dll.

Commit migrated from https://github.com/dotnet/coreclr/commit/3165d8e5bfddd78028c9eb2b5ea3ec995c71a849
---
 src/coreclr/src/jit/codegenarm64.cpp | 48 ++++++++++++----------------
 src/coreclr/src/jit/flowgraph.cpp    |  2 ++
 src/coreclr/src/jit/gentree.cpp      | 13 ++++++++
 src/coreclr/src/jit/gentree.h        | 11 +++++++
 src/coreclr/src/jit/importer.cpp     | 11 +++++++
 src/coreclr/src/jit/lower.cpp        | 18 ++++++++++-
 src/coreclr/src/jit/target.h         |  6 ++++
 src/coreclr/src/zap/zapimport.cpp    | 26 +++++----------
 8 files changed, 88 insertions(+), 47 deletions(-)

diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp
index 574e438a70ca17..127d2791ffc36a 100644
--- a/src/coreclr/src/jit/codegenarm64.cpp
+++ b/src/coreclr/src/jit/codegenarm64.cpp
@@ -6730,39 +6730,31 @@ void CodeGen::genEmitHelperCall(unsigned helper,
 
     if (addr == nullptr)
     {
-        NYI("genEmitHelperCall indirect");
-#if 0
-        assert(pAddr != nullptr);
-        if (genAddrCanBeEncodedAsPCRelOffset((size_t)pAddr))
+        // This is call to a runtime helper.
+        // adrp x, [reloc:rel page addr]
+        // add x, x, [reloc:page offset]
+        // ldr x, [x]
+        // br x
+
+        if (callTargetReg == REG_NA)
         {
-            // generate call whose target is specified by PC-relative 32-bit offset.
-            callType = emitter::EC_FUNC_TOKEN_INDIR;
-            addr = pAddr;
+            // If a callTargetReg has not been explicitly provided, we will use REG_DEFAULT_HELPER_CALL_TARGET, but
+            // this is only a valid assumption if the helper call is known to kill REG_DEFAULT_HELPER_CALL_TARGET.
+            callTargetReg = REG_DEFAULT_HELPER_CALL_TARGET;
         }
-        else
-        {
-            // If this address cannot be encoded as PC-relative 32-bit offset, load it into REG_HELPER_CALL_TARGET
-            // and use register indirect addressing mode to make the call.
-            // mov reg, addr
-            // call [reg]
-            if (callTargetReg == REG_NA)
-            {
-                // If a callTargetReg has not been explicitly provided, we will use REG_DEFAULT_HELPER_CALL_TARGET, but
-                // this is only a valid assumption if the helper call is known to kill REG_DEFAULT_HELPER_CALL_TARGET.
-                callTargetReg = REG_DEFAULT_HELPER_CALL_TARGET;
-            }
-            regMaskTP callTargetMask = genRegMask(callTargetReg);
-            regMaskTP callKillSet = compiler->compHelperCallKillSet((CorInfoHelpFunc)helper);
+        regMaskTP callTargetMask = genRegMask(callTargetReg);
+        regMaskTP callKillSet = compiler->compHelperCallKillSet((CorInfoHelpFunc)helper);
 
-            // assert that all registers in callTargetMask are in the callKillSet
-            noway_assert((callTargetMask & callKillSet) == callTargetMask);
+        // assert that all registers in callTargetMask are in the callKillSet
+        noway_assert((callTargetMask & callKillSet) == callTargetMask);
 
-            callTarget = callTargetReg;
-            CodeGen::genSetRegToIcon(callTarget, (ssize_t) pAddr, TYP_I_IMPL);
-            callType = emitter::EC_INDIR_ARD;
-        }
-#endif // 0
+        callTarget = callTargetReg;
+
+        // adrp + add with relocations will be emitted
+        getEmitter()->emitIns_R_AI(INS_adrp, EA_PTR_DSP_RELOC, callTarget, (ssize_t)pAddr);
+        getEmitter()->emitIns_R_R(INS_ldr, EA_PTRSIZE, callTarget, callTarget);
+        callType = emitter::EC_INDIR_R;
     }
 
     getEmitter()->emitIns_Call(callType,
diff --git a/src/coreclr/src/jit/flowgraph.cpp b/src/coreclr/src/jit/flowgraph.cpp
index 548977593d436f..7ea818ed31c040 100644
--- a/src/coreclr/src/jit/flowgraph.cpp
+++ b/src/coreclr/src/jit/flowgraph.cpp
@@ -7047,6 +7047,8 @@ GenTreePtr Compiler::fgOptimizeDelegateConstructor(GenTreePtr call, CORINFO_C
             info.compCompHnd->getReadyToRunHelper(targetMethod->gtFptrVal.gtLdftnResolvedToken,
                 CORINFO_HELP_READYTORUN_DELEGATE_CTOR, &call->gtCall.gtEntryPoint);
 #endif
+            // This is the case from GetDynamicHelperCell.
+            call->gtCall.setR2RRelativeIndir();
         }
     }
     else
diff --git a/src/coreclr/src/jit/gentree.cpp b/src/coreclr/src/jit/gentree.cpp
index 55fa724baeeef8..103760528c44c5 100644
--- a/src/coreclr/src/jit/gentree.cpp
+++ b/src/coreclr/src/jit/gentree.cpp
@@ -4464,6 +4464,15 @@ unsigned Compiler::gtSetEvalOrder(GenTree * tree)
                 ftreg |= RBM_VIRTUAL_STUB_PARAM;
             }
 
+#ifdef FEATURE_READYTORUN_COMPILER
+#ifdef _TARGET_ARM64_
+            if (tree->gtCall.IsR2RRelativeIndir())
+            {
+                ftreg |= RBM_R2R_INDIRECT_PARAM;
+            }
+#endif
+#endif
+
             // Normally function calls don't preserve caller save registers
             // and thus are much more expensive.
             // However a few function calls do preserve these registers
@@ -7403,6 +7412,10 @@ Compiler::gtDispNodeName(GenTree *tree)
                 gtfType = " ind";
             else if (tree->gtFlags & GTF_CALL_VIRT_STUB)
                 gtfType = " stub";
+#ifdef FEATURE_READYTORUN_COMPILER
+            else if (tree->gtCall.IsR2RRelativeIndir())
+                gtfType = " r2r_ind";
+#endif // FEATURE_READYTORUN_COMPILER
             else if (tree->gtFlags & GTF_CALL_UNMANAGED)
             {
                 char * gtfTypeBufWalk = gtfTypeBuf;
diff --git a/src/coreclr/src/jit/gentree.h b/src/coreclr/src/jit/gentree.h
index 7a31deba3e9c21..d4035b445eae81 100644
--- a/src/coreclr/src/jit/gentree.h
+++ b/src/coreclr/src/jit/gentree.h
@@ -2762,6 +2762,8 @@ struct GenTreeCall final : public GenTree
                                            // a Pinvoke but not as an unmanaged call. See impCheckForPInvokeCall() to
                                            // know when these flags are set.
 
+#define GTF_CALL_M_R2R_REL_INDIRECT     0x2000 // GT_CALL -- ready to run call is indirected through a relative address
+
     bool IsUnmanaged()       { return (gtFlags & GTF_CALL_UNMANAGED) != 0; }
     bool NeedsNullCheck()    { return (gtFlags & GTF_CALL_NULLCHECK) != 0; }
     bool CallerPop()         { return (gtFlags & GTF_CALL_POP_ARGS) != 0; }
@@ -2870,6 +2872,15 @@ struct GenTreeCall final : public GenTree
     bool IsSameThis()      { return (gtCallMoreFlags & GTF_CALL_M_NONVIRT_SAME_THIS) != 0; }
     bool IsDelegateInvoke(){ return (gtCallMoreFlags & GTF_CALL_M_DELEGATE_INV) != 0; }
     bool IsVirtualStubRelativeIndir() { return (gtCallMoreFlags & GTF_CALL_M_VIRTSTUB_REL_INDIRECT) != 0; }
+#ifdef FEATURE_READYTORUN_COMPILER
+    bool IsR2RRelativeIndir() { return (gtCallMoreFlags & GTF_CALL_M_R2R_REL_INDIRECT) != 0; }
+    void setR2RRelativeIndir() {
+        if (gtEntryPoint.accessType == IAT_PVALUE)
+        {
+            gtCallMoreFlags |= GTF_CALL_M_R2R_REL_INDIRECT;
+        }
+    }
+#endif // FEATURE_READYTORUN_COMPILER
     bool IsVarargs()       { return (gtCallMoreFlags & GTF_CALL_M_VARARGS) != 0; }
 
     unsigned short gtCallMoreFlags;  // in addition to gtFlags
diff --git a/src/coreclr/src/jit/importer.cpp b/src/coreclr/src/jit/importer.cpp
index 52a7a7148e19c3..703f604a290cca 100755
--- a/src/coreclr/src/jit/importer.cpp
+++ b/src/coreclr/src/jit/importer.cpp
@@ -1647,6 +1647,9 @@ GenTreePtr Compiler::impReadyToRunHelperToTree(
 
     op1->gtCall.gtEntryPoint = lookup;
 
+    // This is the case from GetDynamicHelperCell.
+    op1->gtCall.setR2RRelativeIndir();
+
     return op1;
 }
 #endif
@@ -4519,6 +4522,8 @@ GenTreePtr Compiler::impImportLdvirtftn (GenTreePtr thisPtr,
 
         call->gtEntryPoint = pCallInfo->codePointerLookup.constLookup;
 
+        // This is the case from GetDynamicHelperCell.
+        call->setR2RRelativeIndir();
         return call;
     }
 #endif
@@ -5192,6 +5197,9 @@ GenTreePtr Compiler::impImportStaticFieldAccess(CORINFO_RESOLVED_TOKEN * pResolv
 
             op1 = gtNewHelperCallNode(CORINFO_HELP_READYTORUN_STATIC_BASE, TYP_BYREF, callFlags);
             op1->gtCall.gtEntryPoint = pFieldInfo->fieldLookup;
+
+            // This is the case from GetDynamicHelperCell.
+            op1->gtCall.setR2RRelativeIndir();
         }
         else
 #endif
@@ -6028,6 +6036,9 @@ var_types Compiler::impImportCall (OPCODE opcode,
                 if (opts.IsReadyToRun())
                 {
                     call->gtCall.gtEntryPoint = callInfo->codePointerLookup.constLookup;
+
+                    // This is the case from GetExternalMethodCell.
+                    call->gtCall.setR2RRelativeIndir();
                 }
 #endif
                 break;
diff --git a/src/coreclr/src/jit/lower.cpp b/src/coreclr/src/jit/lower.cpp
index 1df4f3bbcdb8d7..a8c11d5d17ff88 100644
--- a/src/coreclr/src/jit/lower.cpp
+++ b/src/coreclr/src/jit/lower.cpp
@@ -2399,10 +2399,26 @@ GenTree* Lowering::LowerDirectCall(GenTreeCall* call)
             break;
 
         case IAT_PVALUE:
+        {
             // Non-virtual direct calls to addresses accessed by
             // a single indirection.
-            result = Ind(AddrGen(addr));
+            GenTree* cellAddr = AddrGen(addr);
+            GenTree* indir = Ind(cellAddr);
+
+#ifdef FEATURE_READYTORUN_COMPILER
+#ifdef _TARGET_ARM64_
+            // For arm64, we dispatch code same as VSD using X11 for indirection cell address,
+            // which ZapIndirectHelperThunk expects.
+            if (call->IsR2RRelativeIndir())
+            {
+                cellAddr->gtRegNum = REG_R2R_INDIRECT_PARAM;
+                indir->gtRegNum = REG_JUMP_THUNK_PARAM;
+            }
+#endif
+#endif
+            result = indir;
             break;
+        }
 
         case IAT_PPVALUE:
             // Non-virtual direct calls to addresses accessed by
diff --git a/src/coreclr/src/jit/target.h b/src/coreclr/src/jit/target.h
index cd03315206509c..0b2bff3a215cbf 100644
--- a/src/coreclr/src/jit/target.h
+++ b/src/coreclr/src/jit/target.h
@@ -1514,6 +1514,7 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
   #define RBM_CALLEE_SAVED        (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED)
   #define RBM_CALLEE_TRASH        (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH)
   #define RBM_CALLEE_TRASH_NOGC   (RBM_R12|RBM_R13|RBM_R14|RBM_R15)
+  #define REG_DEFAULT_HELPER_CALL_TARGET REG_R12
 
   #define RBM_ALLINT              (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH)
   #define RBM_ALLFLOAT            (RBM_FLT_CALLEE_SAVED | RBM_FLT_CALLEE_TRASH)
@@ -1606,6 +1607,11 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
  #define RBM_VIRTUAL_STUB_PARAM   RBM_R11
  #define PREDICT_REG_VIRTUAL_STUB_PARAM PREDICT_REG_R11
 
+  // R2R indirect call. Use the same registers as VSD
+  #define REG_R2R_INDIRECT_PARAM  REG_R11
+  #define RBM_R2R_INDIRECT_PARAM  RBM_R11
+  #define PREDICT_REG_RER_INDIRECT_PARAM PREDICT_REG_R11
+
   // Registers used by PInvoke frame setup
   #define REG_PINVOKE_FRAME       REG_R8
   #define RBM_PINVOKE_FRAME       RBM_R8
diff --git a/src/coreclr/src/zap/zapimport.cpp b/src/coreclr/src/zap/zapimport.cpp
index 37226fcc41ced6..323c7dd5cb849a 100644
--- a/src/coreclr/src/zap/zapimport.cpp
+++ b/src/coreclr/src/zap/zapimport.cpp
@@ -2146,17 +2146,8 @@ DWORD ZapIndirectHelperThunk::SaveWorker(ZapWriter * pZapWriter)
 #elif defined(_TARGET_ARM64_)
     if (IsDelayLoadHelper())
     {
-        if (IsVSD())
-        {
-            // x11 contains indirection cell
-            // Do nothing x11 contains our first param
-        }
-        else
-        {
-            // mov x11, x12
-            *(DWORD*)p = 0xaa0c03eb;
-            p += 4;
-        }
+        // x11 contains indirection cell
+        // Do nothing x11 contains our first param
 
         // movz x8, #index
         DWORD index = GetSectionIndex();
@@ -2166,9 +2157,9 @@ DWORD ZapIndirectHelperThunk::SaveWorker(ZapWriter * pZapWriter)
 
         // move Module* -> x9
         // ldr x9, [PC+0x14]
-        *(DWORD*)p = 0x58000289;
+        *(DWORD*)p = 0x580000A9;
         p += 4;
-
+
         //ldr x9, [x9]
         *(DWORD*)p = 0xf9400129;
         p += 4;
@@ -2178,7 +2169,7 @@ DWORD ZapIndirectHelperThunk::SaveWorker(ZapWriter * pZapWriter)
     {
         // Move Module* -> x1
        // ldr x1, [PC+0x14]
-        *(DWORD*)p = 0x58000289;
+        *(DWORD*)p = 0x580000A1;
         p += 4;
 
         // ldr x1, [x1]
         *(DWORD*)p = 0xf9400021;
         p += 4;
     }
 
     // branch to helper
-
-    // mov x12, [helper]
     // ldr x12, [PC+0x14]
-    *(DWORD*)p = 0x58000289;
+    *(DWORD*)p = 0x580000AC;
     p += 4;
 
     // ldr x12, [x12]
     *(DWORD*)p = 0xf940018c;
     p += 4;
 
     // br x12
     *(DWORD *)p = 0xd61f0180;
-    p += 4;
+    p += 4;
 
     // [Module*]
     if (pImage != NULL)
         pImage->WriteReloc(buffer, (int)(p - buffer), pImage->GetImportTable()->GetHelperImport(READYTORUN_HELPER_Module), 0, IMAGE_REL_BASED_PTR);
     p += 8;
+    // [helper]
     if (pImage != NULL)
         pImage->WriteReloc(buffer, (int)(p - buffer), pImage->GetImportTable()->GetHelperImport(GetReadyToRunHelper()), 0, IMAGE_REL_BASED_PTR);
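
Note on the `ldr` encoding fix in the zapimport.cpp hunks (change 3 above): ARM64
`LDR (literal)` stores its PC-relative offset in the imm19 field in units of words,
so a byte offset must be divided by 4 before it is encoded; the hardware multiplies
it back by 4 at execution time. The old constant 0x58000289 placed the byte offset
0x14 directly into imm19, which actually encodes `ldr x9, [PC+0x50]`. The following
is a standalone sketch of that encoding math, not part of the patch; the helper name
`EncodeLdrLiteralX` is made up for illustration.

```cpp
// Sketch: forming the imm19 field of an ARM64 "LDR Xt, <label>" (LDR literal).
// The offset is stored in words, so divide the byte offset by 4 before encoding.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t EncodeLdrLiteralX(unsigned rt, int32_t byteOffset)
{
    assert(rt < 32);                                   // Xt register number
    assert(byteOffset % 4 == 0);                       // must be word aligned
    int32_t imm19 = byteOffset / 4;                    // scale down by 4
    assert(imm19 >= -(1 << 18) && imm19 < (1 << 18));  // must fit in 19 bits
    return 0x58000000u | ((uint32_t)(imm19 & 0x7FFFF) << 5) | rt;
}

int main()
{
    // Matches the corrected constants in ZapIndirectHelperThunk::SaveWorker:
    printf("%08X\n", (unsigned)EncodeLdrLiteralX(9, 0x14));   // 580000A9: ldr x9,  [PC+0x14]
    printf("%08X\n", (unsigned)EncodeLdrLiteralX(1, 0x14));   // 580000A1: ldr x1,  [PC+0x14]
    printf("%08X\n", (unsigned)EncodeLdrLiteralX(12, 0x14));  // 580000AC: ldr x12, [PC+0x14]
    return 0;
}
```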
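
For completeness, here is a rough sketch of the x64/x86 "encoding trick" described
in the commit message, and why arm64 cannot use it. This is illustrative code, not
CoreCLR source, and the helper name is made up. On x64 an R2R indirect call is
emitted as `call qword ptr [rip+disp32]` (bytes `FF 15 <disp32>`), so a thunk can
recover the indirection cell address from nothing but its return address:

```cpp
// Sketch: recovering the indirection cell address from the return address of an
// x64 "call [rip+disp32]" instruction.
#include <cstdint>
#include <cstring>

static uint8_t* GetIndirCellFromReturnAddressX64(uint8_t* returnAddress)
{
    // The 6-byte call instruction sits immediately before the return address.
    const uint8_t* call = returnAddress - 6;
    if (call[0] == 0xFF && call[1] == 0x15)            // call [rip+disp32]
    {
        int32_t disp32;
        std::memcpy(&disp32, call + 2, sizeof(disp32));
        // RIP-relative displacements are relative to the next instruction,
        // which is exactly the return address.
        return returnAddress + disp32;                 // -> indirection cell
    }
    return nullptr;                                    // not the expected call form
}
```

arm64 has no call-through-memory encoding; the target must first be loaded into a
register (for example adrp/ldr then br or blr), so the cell address is not
recoverable from the call site afterwards. That is why this change passes the cell
address in x11 (REG_R2R_INDIRECT_PARAM), mirroring the existing VSD convention.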