Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arm64: Implement region write barriers #111636

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
96 changes: 96 additions & 0 deletions docs/design/coreclr/jit/GC-write-barriers.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# GC write barriers

The GC write barrier function (JIT_WriteBarrier) is generally the hottest function in CoreCLR and is written in assembly. The full pseudo code for the function is as follows:


````
JIT_WriteBarrier(Object **dst, Object *ref)
Set *dst = ref

// Shadow Heap update
ifdef WRITE_BARRIER_CHECK:
if g_GCShadow != 0:
long *shadow_dst = g_GCShadow + (dst - g_lowest_address)
// Check shadow heap location is within shadow heap
if shadow_dst < g_GCShadowEnd:
*shadow_dst = ref
atomic: wait for stores to complete
if *dst != ref:
*shadow_dst = INVALIDGCVALUE

// Update the write watch table, if it's in use
ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP:
if g_sw_ww_table != 0:
char *ww_table_dst = g_sw_ww_table + (dst >> 12)
if *ww_table_dst == 0:
*ww_table_dst = 0xff

// Return if the reference is not in Gen 0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Return if the reference is not in Gen 0
// Return if the reference is not in ephemeral generations

if ref < g_ephemeral_low || ref >= g_ephemeral_high:
return

// Region Checks
if g_region_to_generation_table != 0:

// Calculate region locations
char reg_loc_dst = *((dst >> g_region_shr) + g_region_to_generation_table)
char reg_loc_ref = *((ref >> g_region_shr) + g_region_to_generation_table)

// Return if the region we're storing into is Gen 0
if reg_loc_dst == 0:
return

// Return unless this is an old to young reference (ref is in a younger generation than dst)
if reg_loc_ref >= reg_loc_dst:
return

// Bitwise write barriers only
if g_region_use_bitwise_write_barrier:

char *card_table_dst = (dst >> 11) + g_card_table
char dst_bit = 1 << ((dst >> 8) & 7)

// Check if we need to update the card table
if (*card_table_dst & dst_bit) != 0:
return

// Atomically update the card table
lock: *card_table_dst |= dst_bit

goto CardBundle

// Check if we need to update the card table
char *card_table_dst = (dst >> 11) + g_card_table
if *card_table_dst == 0xff:
return

// Update the card table
*card_table_dst = 0xff

CardBundle:

// Mark the card bundle table as dirty
ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES:
char *card_bundle_dst = (dst >> 21) + g_card_bundle_table
if *card_bundle_dst != 0xff:
*card_bundle_dst = 0xff

````

The Checked Write Barrier has additional checks:

````
JIT_CheckedWriteBarrier(Object **dst, Object *ref)

// Return if the destination is not on the heap
if dst < g_lowest_address || dst >= g_highest_address:
return

return JIT_WriteBarrier(dst, ref)
````



## WritebarrierManager

On AMD64, there are several different implementations of the write barrier function. Each implementation assumes different state and so can skip certain checks. The actual write barrier that is called is a copy of one of these implementations. The WritebarrierManager keeps track of which implementation is currently being used. As internal state changes, the WritebarrierManager updates the copy to the correct implementation. In practice, most of the internal state is fixed on startup, with only changes to/from use of write watch barriers changing during runtime.
22 changes: 16 additions & 6 deletions src/coreclr/vm/arm64/asmhelpers.S
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,9 @@ LEAF_END ThePreStubPatch, _TEXT
LEAF_ENTRY JIT_UpdateWriteBarrierState, _TEXT
PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -16

// x0-x7, x10 will contain intended new state
// x0-x7, x10-x11, x13-x14 will contain intended new state
// x8 will preserve skipEphemeralCheck
// x9 will preserve writeableOffset
// x12 will be used for pointers

mov x8, x0
Expand Down Expand Up @@ -231,12 +232,21 @@ LOCAL_LABEL(EphemeralCheckEnabled):
PREPARE_EXTERNAL_VAR g_highest_address, x12
ldr x6, [x12]

PREPARE_EXTERNAL_VAR g_region_to_generation_table, x12
ldr x7, [x12]

PREPARE_EXTERNAL_VAR g_region_shr, x12
ldr w10, [x12]

PREPARE_EXTERNAL_VAR g_region_use_bitwise_write_barrier, x12
ldr w11, [x12]

#ifdef WRITE_BARRIER_CHECK
PREPARE_EXTERNAL_VAR g_GCShadow, x12
ldr x7, [x12]
ldr x13, [x12]

PREPARE_EXTERNAL_VAR g_GCShadowEnd, x12
ldr x10, [x12]
ldr x14, [x12]
#endif

// Update wbs state
Expand All @@ -247,12 +257,12 @@ LOCAL_LABEL(EphemeralCheckEnabled):
stp x0, x1, [x12], 16
stp x2, x3, [x12], 16
stp x4, x5, [x12], 16
str x6, [x12], 8
stp x6, x7, [x12], 16
stp w10, w11, [x12], 8
#ifdef WRITE_BARRIER_CHECK
stp x7, x10, [x12], 16
stp x13, x14, [x12], 16
#endif


EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 16
EPILOG_RETURN
LEAF_END JIT_UpdateWriteBarrierState
Expand Down
60 changes: 56 additions & 4 deletions src/coreclr/vm/arm64/patchedcode.S
Original file line number Diff line number Diff line change
Expand Up @@ -142,33 +142,79 @@ LOCAL_LABEL(ShadowUpdateEnd):
#ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
// Update the write watch table if necessary
ldr x12, LOCAL_LABEL(wbs_sw_ww_table)
cbz x12, LOCAL_LABEL(CheckCardTable)
cbz x12, LOCAL_LABEL(CheckCardTableBounds)
add x12, x12, x14, lsr #0xc // SoftwareWriteWatch::AddressToTableByteIndexShift
ldrb w17, [x12]
cbnz x17, LOCAL_LABEL(CheckCardTable)
cbnz x17, LOCAL_LABEL(CheckCardTableBounds)
mov w17, #0xFF
strb w17, [x12]
#endif

LOCAL_LABEL(CheckCardTable):
LOCAL_LABEL(CheckCardTableBounds):
// Branch to Exit if the reference is not in the ephemeral generations
ldr x12, LOCAL_LABEL(wbs_ephemeral_low)
ldr x17, LOCAL_LABEL(wbs_ephemeral_high)
cmp x15, x12
ccmp x15, x17, #0x2, hs
bhs LOCAL_LABEL(Exit)

// Region Checks

// Check if using regions
ldr x17, LOCAL_LABEL(wbs_region_to_generation_table)
cbz x17, LOCAL_LABEL(CheckCardTable)

// Calculate region locations
ldr w12, LOCAL_LABEL(wbs_region_shr)
lsr x15, x15, x12
add x15, x15, x17 // x15 = (RHS >> wbs_region_shr) + wbs_region_to_generation_table
lsr x12, x14, x12
add x12, x12, x17 // x12 = (LHS >> wbs_region_shr) + wbs_region_to_generation_table

// Check whether the region we're storing into is gen 0 - nothing to do in this case
ldrb w12, [x12]
cbz w12, LOCAL_LABEL(Exit)

// Check this is going from old to young
ldrb w15, [x15]
cmp w15, w12
bhs LOCAL_LABEL(Exit)

// Bitwise write barriers only
ldr w17, LOCAL_LABEL(wbs_region_use_bitwise_write_barrier)
cbz w17, LOCAL_LABEL(CheckCardTable)

// Check if we need to update the card table
lsr w17, w14, 8
and w17, w17, 7
movz w15, 1
lsl w17, w15, w17 // w17 = 1 << ((LHS >> 8) & 7)
ldr x12, LOCAL_LABEL(wbs_card_table)
add x15, x12, x14, lsr #11
ldrb w12, [x15]
ldrb w12, [x15] // w12 = [(LHS >> 11) + g_card_table]
tst w12, w17
bne LOCAL_LABEL(Exit)

// Atomically update the card table
// Requires LSE, but the code is only compiled for 8.0
.word 0x383131FF // stsetb w17, [x15]
b LOCAL_LABEL(CheckCardBundleTable)

// End of Region Checks

LOCAL_LABEL(CheckCardTable):
// Check if we need to update the card table
ldr x12, LOCAL_LABEL(wbs_card_table)
add x15, x12, x14, lsr #11
ldrb w12, [x15] // w12 = [(LHS >> 11) + g_card_table]
cmp x12, 0xFF
beq LOCAL_LABEL(Exit)

// Update the card table
mov x12, 0xFF
strb w12, [x15]

LOCAL_LABEL(CheckCardBundleTable):
#ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES
// Check if we need to update the card bundle table
ldr x12, LOCAL_LABEL(wbs_card_bundle_table)
Expand Down Expand Up @@ -208,6 +254,12 @@ LOCAL_LABEL(wbs_lowest_address):
.quad 0
LOCAL_LABEL(wbs_highest_address):
.quad 0
LOCAL_LABEL(wbs_region_to_generation_table):
.quad 0
LOCAL_LABEL(wbs_region_shr):
.word 0
LOCAL_LABEL(wbs_region_use_bitwise_write_barrier):
.word 0
#ifdef WRITE_BARRIER_CHECK
LOCAL_LABEL(wbs_GCShadow):
.quad 0
Expand Down
24 changes: 16 additions & 8 deletions src/coreclr/vm/arm64/stubs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,7 @@ EXTERN_C void JIT_UpdateWriteBarrierState(bool skipEphemeralCheck, size_t writea
extern "C" void STDCALL JIT_PatchedCodeStart();
extern "C" void STDCALL JIT_PatchedCodeLast();

static void UpdateWriteBarrierState(bool skipEphemeralCheck)
static void UpdateWriteBarrierState()
{
BYTE *writeBarrierCodeStart = GetWriteBarrierCodeLocation((void*)JIT_PatchedCodeStart);
BYTE *writeBarrierCodeStartRW = writeBarrierCodeStart;
Expand All @@ -880,7 +880,15 @@ static void UpdateWriteBarrierState(bool skipEphemeralCheck)
writeBarrierWriterHolder.AssignExecutableWriterHolder(writeBarrierCodeStart, (BYTE*)JIT_PatchedCodeLast - (BYTE*)JIT_PatchedCodeStart);
writeBarrierCodeStartRW = writeBarrierWriterHolder.GetRW();
}
JIT_UpdateWriteBarrierState(GCHeapUtilities::IsServerHeap(), writeBarrierCodeStartRW - writeBarrierCodeStart);

// Skip ephemeral checks for regionless server GC
bool skipEphemeralCheck = false;
if (GCHeapUtilities::IsServerHeap() && g_region_to_generation_table == nullptr)
{
skipEphemeralCheck = true;
}

JIT_UpdateWriteBarrierState(skipEphemeralCheck, writeBarrierCodeStartRW - writeBarrierCodeStart);
}

void InitJITHelpers1()
Expand Down Expand Up @@ -909,12 +917,12 @@ void InitJITHelpers1()
}
}

UpdateWriteBarrierState(GCHeapUtilities::IsServerHeap());
UpdateWriteBarrierState();
}


#else
void UpdateWriteBarrierState(bool) {}
void UpdateWriteBarrierState() {}
#endif // !defined(DACCESS_COMPILE)

PTR_CONTEXT GetCONTEXTFromRedirectedStubStackFrame(T_DISPATCHER_CONTEXT * pDispatcherContext)
Expand Down Expand Up @@ -1076,26 +1084,26 @@ void FlushWriteBarrierInstructionCache()

int StompWriteBarrierEphemeral(bool isRuntimeSuspended)
{
UpdateWriteBarrierState(GCHeapUtilities::IsServerHeap());
UpdateWriteBarrierState();
return SWB_PASS;
}

int StompWriteBarrierResize(bool isRuntimeSuspended, bool bReqUpperBoundsCheck)
{
UpdateWriteBarrierState(GCHeapUtilities::IsServerHeap());
UpdateWriteBarrierState();
return SWB_PASS;
}

#ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
int SwitchToWriteWatchBarrier(bool isRuntimeSuspended)
{
UpdateWriteBarrierState(GCHeapUtilities::IsServerHeap());
UpdateWriteBarrierState();
return SWB_PASS;
}

int SwitchToNonWriteWatchBarrier(bool isRuntimeSuspended)
{
UpdateWriteBarrierState(GCHeapUtilities::IsServerHeap());
UpdateWriteBarrierState();
return SWB_PASS;
}
#endif // FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
Expand Down
25 changes: 23 additions & 2 deletions src/coreclr/vm/gcenv.ee.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1053,6 +1053,7 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
ThreadSuspend::RestartEE(FALSE, TRUE);
}
return; // unlike other branches we have already done cleanup so bailing out here

case WriteBarrierOp::StompEphemeral:
assert(args->is_runtime_suspended && "the runtime must be suspended here!");
// StompEphemeral requires a new ephemeral low and a new ephemeral high
Expand All @@ -1063,8 +1064,16 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
g_region_to_generation_table = args->region_to_generation_table;
g_region_shr = args->region_shr;
g_region_use_bitwise_write_barrier = args->region_use_bitwise_write_barrier;
#if defined(HOST_ARM64)
// Only allow bitwise write barriers if LSE atomics are present
if (!g_arm64_atomics_present)
{
g_region_use_bitwise_write_barrier = false;
}
#endif
stompWBCompleteActions |= ::StompWriteBarrierEphemeral(args->is_runtime_suspended);
break;

case WriteBarrierOp::Initialize:
assert(args->is_runtime_suspended && "the runtime must be suspended here!");
// This operation should only be invoked once, upon initialization.
Expand All @@ -1090,16 +1099,26 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
g_region_to_generation_table = args->region_to_generation_table;
g_region_shr = args->region_shr;
g_region_use_bitwise_write_barrier = args->region_use_bitwise_write_barrier;
g_ephemeral_low = args->ephemeral_low;
g_ephemeral_high = args->ephemeral_high;
#if defined(HOST_ARM64)
// Only allow bitwise write barriers if LSE atomics are present
if (!g_arm64_atomics_present)
{
g_region_use_bitwise_write_barrier = false;
}
#endif
stompWBCompleteActions |= ::StompWriteBarrierResize(true, false);

#if !defined(HOST_ARM64)
// StompWriteBarrierResize does not necessarily bash g_ephemeral_low
// usages, so we must do so here. This is particularly true on x86,
// where StompWriteBarrierResize will not bash g_ephemeral_low when
// called with the parameters (true, false), as it is above.
g_ephemeral_low = args->ephemeral_low;
g_ephemeral_high = args->ephemeral_high;
stompWBCompleteActions |= ::StompWriteBarrierEphemeral(true);
#endif
break;

case WriteBarrierOp::SwitchToWriteWatch:
#ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
assert(args->is_runtime_suspended && "the runtime must be suspended here!");
Expand All @@ -1111,6 +1130,7 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
assert(!"should never be called without FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP");
#endif // FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
break;

case WriteBarrierOp::SwitchToNonWriteWatch:
#ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
assert(args->is_runtime_suspended && "the runtime must be suspended here!");
Expand All @@ -1121,6 +1141,7 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
assert(!"should never be called without FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP");
#endif // FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
break;

default:
assert(!"unknown WriteBarrierOp enum");
}
Expand Down
Loading