From 1d6aa227c6a385a51c027e206c00582b46adbaee Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Sat, 22 Apr 2017 00:13:13 -0400 Subject: [PATCH 1/3] A few GC clean up/tweak * Rename and clean up thread heap initialization * More reliable inlining of `gc_setmark_buf` in `gc.c` * Move assertion part of `gc_datatype_assert` outline. --- src/gc.c | 44 +++++++++++++++++++++++++------------------- src/julia_internal.h | 2 +- src/julia_threads.h | 12 ++++++------ src/threading.c | 5 +---- 4 files changed, 33 insertions(+), 30 deletions(-) diff --git a/src/gc.c b/src/gc.c index bfa5c9485e0e3b..55ba458ffa5179 100644 --- a/src/gc.c +++ b/src/gc.c @@ -616,10 +616,7 @@ STATIC_INLINE void gc_setmark(jl_ptls_t ptls, jl_taggedvalue_t *o, } } -#ifndef __cplusplus -inline -#endif -void gc_setmark_buf(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) +STATIC_INLINE void gc_setmark_buf_(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) { jl_taggedvalue_t *buf = jl_astaggedvalue(o); uintptr_t tag = buf->header; @@ -642,6 +639,11 @@ void gc_setmark_buf(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) } } +void gc_setmark_buf(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) +{ + gc_setmark_buf_(ptls, o, mark_mode, minsz); +} + #define should_collect() (__unlikely(gc_num.allocd>0)) static inline int maybe_collect(jl_ptls_t ptls) @@ -1357,7 +1359,7 @@ NOINLINE static int gc_mark_module(jl_ptls_t ptls, jl_module_t *m, for(i=1; i < m->bindings.size; i+=2) { if (table[i] != HT_NOTFOUND) { jl_binding_t *b = (jl_binding_t*)table[i]; - gc_setmark_buf(ptls, b, bits, sizeof(jl_binding_t)); + gc_setmark_buf_(ptls, b, bits, sizeof(jl_binding_t)); void *vb = jl_astaggedvalue(b); verify_parent1("module", m, &vb, "binding_buff"); (void)vb; @@ -1438,11 +1440,11 @@ static void gc_mark_task_stack(jl_ptls_t ptls, jl_task_t *ta, int d, int8_t bits jl_ptls_t ptls2 = jl_all_tls_states[tid]; if (stkbuf) { #ifdef COPY_STACKS - gc_setmark_buf(ptls, ta->stkbuf, bits, ta->bufsz); + gc_setmark_buf_(ptls, ta->stkbuf, bits, ta->bufsz); #else // stkbuf isn't owned by julia for the root task if (ta != ptls2->root_task) { - gc_setmark_buf(ptls, ta->stkbuf, bits, ta->ssize); + gc_setmark_buf_(ptls, ta->stkbuf, bits, ta->ssize); } #endif } @@ -1493,10 +1495,8 @@ void gc_mark_object_list(jl_ptls_t ptls, arraylist_t *list, size_t start) } } -STATIC_INLINE void gc_assert_datatype(jl_datatype_t *vt) +JL_NORETURN NOINLINE void gc_assert_datatype_fail(jl_datatype_t *vt) { - if (__likely(jl_is_datatype(vt))) - return; jl_printf(JL_STDOUT, "GC error (probable corruption) :\n"); gc_debug_print_status(); jl_(vt); @@ -1504,11 +1504,12 @@ STATIC_INLINE void gc_assert_datatype(jl_datatype_t *vt) abort(); } -// for chasing down unwanted references -/* -static jl_value_t *lookforme = NULL; -JL_DLLEXPORT void jl_gc_lookfor(jl_value_t *v) { lookforme = v; } -*/ +STATIC_INLINE void gc_assert_datatype(jl_datatype_t *vt) +{ + if (__likely(jl_is_datatype(vt))) + return; + gc_assert_datatype_fail(vt); +} #define MAX_MARK_DEPTH 400 // Scan a marked object `v` and recursively mark its children. 
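/* Aside (illustration only, not part of this patch): the refactoring above keeps the
 * cheap datatype check inline and moves the failure path into a separate NOINLINE
 * function. A minimal standalone sketch of that pattern, with hypothetical names and
 * GCC-style annotations matching the codebase's __likely/__unlikely usage: */
#include <stdio.h>
#include <stdlib.h>

__attribute__((noinline)) static void check_fail_slow(long value)
{
    /* Cold path: do the expensive reporting out of line, then abort. */
    fprintf(stderr, "invariant violated: %ld\n", value);
    abort();
}

static inline void check_fast(long value)
{
    /* Hot path: a single, well-predicted branch when the invariant holds. */
    if (__builtin_expect(value >= 0, 1))
        return;
    check_fail_slow(value);
}

int main(void)
{
    check_fast(42);   /* stays on the inlined fast path */
    check_fast(-1);   /* takes the out-of-line failure path and aborts */
    return 0;
}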
@@ -1567,8 +1568,8 @@ static void gc_scan_obj_(jl_ptls_t ptls, jl_value_t *v, int d, verify_parent1("array", v, &val_buf, "buffer ('loc' addr is meaningless)"); (void)val_buf; - gc_setmark_buf(ptls, (char*)a->data - a->offset * a->elsize, - bits, array_nbytes(a)); + gc_setmark_buf_(ptls, (char*)a->data - a->offset * a->elsize, + bits, array_nbytes(a)); } if (flags.ptrarray && a->data != NULL) { size_t l = jl_array_len(a); @@ -2128,8 +2129,8 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) return jl_gc_alloc_(ptls, sz, ty); } -// Per-thread initialization (when threading is fully implemented) -void jl_mk_thread_heap(jl_ptls_t ptls) +// Per-thread initialization +void jl_init_thread_heap(jl_ptls_t ptls) { jl_thread_heap_t *heap = &ptls->heap; jl_gc_pool_t *p = heap->norm_pools; @@ -2151,6 +2152,11 @@ void jl_mk_thread_heap(jl_ptls_t ptls) arraylist_new(heap->remset, 0); arraylist_new(heap->last_remset, 0); arraylist_new(&ptls->finalizers, 0); + + jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; + gc_cache->perm_scanned_bytes = 0; + gc_cache->scanned_bytes = 0; + gc_cache->nbig_obj = 0; } // System-wide initializations diff --git a/src/julia_internal.h b/src/julia_internal.h index 00cb0f60c955f2..216d0833846ed0 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -512,7 +512,7 @@ void jl_gc_init(void); void jl_init_signal_async(void); void jl_init_debuginfo(void); void jl_init_runtime_ccall(void); -void jl_mk_thread_heap(jl_ptls_t ptls); +void jl_init_thread_heap(jl_ptls_t ptls); void _julia_init(JL_IMAGE_SEARCH rel); diff --git a/src/julia_threads.h b/src/julia_threads.h index bbb11d3e231378..7d7c1275880072 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -34,6 +34,12 @@ typedef struct { uint16_t osize; // size of objects in this pool } jl_gc_pool_t; +// Recursive spin lock +typedef struct { + volatile unsigned long owner; + uint32_t count; +} jl_mutex_t; + typedef struct { // variable for tracking weak references arraylist_t weak_refs; @@ -503,12 +509,6 @@ JL_DLLEXPORT void (jl_gc_safepoint)(void); } \ } while (0) -// Recursive spin lock -typedef struct { - volatile unsigned long owner; - uint32_t count; -} jl_mutex_t; - JL_DLLEXPORT void jl_gc_enable_finalizers(jl_ptls_t ptls, int on); static inline void jl_lock_frame_push(jl_mutex_t *lock); static inline void jl_lock_frame_pop(void); diff --git a/src/threading.c b/src/threading.c index baf4c2f808a4b9..4d45433932a5d9 100644 --- a/src/threading.c +++ b/src/threading.c @@ -266,9 +266,6 @@ static void ti_initthread(int16_t tid) ptls->tid = tid; ptls->pgcstack = NULL; ptls->gc_state = 0; // GC unsafe - ptls->gc_cache.perm_scanned_bytes = 0; - ptls->gc_cache.scanned_bytes = 0; - ptls->gc_cache.nbig_obj = 0; // Conditionally initialize the safepoint address. See comment in // `safepoint.c` if (tid == 0) { @@ -287,7 +284,7 @@ static void ti_initthread(int16_t tid) abort(); } ptls->bt_data = (uintptr_t*)bt_data; - jl_mk_thread_heap(ptls); + jl_init_thread_heap(ptls); jl_install_thread_signal_handler(ptls); jl_all_tls_states[tid] = ptls; From cc1ebd97bc5eb0e9bf5b844f8a0bf0b7cba775d6 Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Mon, 24 Apr 2017 19:51:10 -0400 Subject: [PATCH 2/3] Implement GC mark loop * Uses an iterative (mostly) Depth-first search (DFS) to mark all the objects. * Use two manually managed stacks instead of the native stack for better performance and compatibility with incremental/parallel scanning. 
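As a self-contained illustration of the approach (the `node_t` type and its fields are hypothetical, not the GC's actual data structures), an iterative mark phase driven by an explicit, growable stack looks roughly like this:

#include <stdlib.h>

/* Hypothetical object-graph node; only for illustrating the traversal shape. */
typedef struct node {
    int marked;
    struct node *children[2];
} node_t;

/* Iterative DFS marking with a manually managed, resizable stack, so deep
 * object graphs can never overflow the native call stack. */
static void mark_iterative(node_t *root)
{
    size_t cap = 64, top = 0;
    node_t **stack = (node_t**)malloc(cap * sizeof(node_t*));
    if (!stack)
        abort();
    stack[top++] = root;
    while (top > 0) {
        node_t *n = stack[--top];
        if (!n || n->marked)
            continue;
        n->marked = 1;
        if (top + 2 > cap) {
            /* Grow the stack on demand instead of recursing deeper. */
            cap *= 2;
            stack = (node_t**)realloc(stack, cap * sizeof(node_t*));
            if (!stack)
                abort();
        }
        stack[top++] = n->children[0];
        stack[top++] = n->children[1];
    }
    free(stack);
}

The real loop below splits this single stack into a pc stack (what to do next) and a data stack (the state each step needs), which is what makes resizing and work stealing cheap.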
--- src/gc-debug.c | 179 ++++++++++ src/gc.c | 849 ++++++++++++++++++++++++++++++++++++++++++++ src/gc.h | 143 +++++++- src/julia_threads.h | 4 + 4 files changed, 1173 insertions(+), 2 deletions(-) diff --git a/src/gc-debug.c b/src/gc-debug.c index 7e77d7b6e9c2d1..de30f58986e653 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1186,6 +1186,185 @@ void gc_count_pool(void) jl_safe_printf("************************\n"); } +int gc_slot_to_fieldidx(void *obj, void *slot) +{ + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + int nf = (int)jl_datatype_nfields(vt); + for (int i = 0; i < nf; i++) { + void *fieldaddr = (char*)obj + jl_field_offset(vt, i); + if (fieldaddr >= slot) { + return i; + } + } + return -1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + else if (vt->name == jl_array_typename) { + jl_array_t *a = (jl_array_t*)obj; + if (!a->flags.ptrarray) + return -1; + start = (char*)a->data; + len = jl_array_len(a); + } + if (slot < start || slot >= start + sizeof(void*) * len) + return -1; + return (slot - start) / sizeof(void*); +} + +// Print a backtrace from the bottom (start) of the mark stack up to `sp` +// `pc_offset` will be added to `sp` for convenience in the debugger. +NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, gc_mark_sp_t sp, int pc_offset) +{ + jl_jmp_buf *old_buf = ptls->safe_restore; + jl_jmp_buf buf; + ptls->safe_restore = &buf; + if (jl_setjmp(buf, 0) != 0) { + jl_printf((JL_STREAM*)STDERR_FILENO, + "\n!!! ERROR when unwinding gc mark loop -- ABORTING !!!\n"); + ptls->safe_restore = old_buf; + return; + } + void **top = sp.pc + pc_offset; + char *data_top = sp.data; + sp.data = ptls->gc_cache.data_stack; + sp.pc = ptls->gc_cache.pc_stack; + int isroot = 1; + while (sp.pc < top) { + void *pc = *sp.pc; + const char *prefix = isroot ? 
"r--" : " `-"; + isroot = 0; + if (pc == gc_mark_label_addrs[0]) { + // marked_obj + gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); + if ((char*)data > data_top) { + jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); + break; + } + jl_safe_printf("%p: Root object: %p :: %p (bits: %d)\n of type ", + data, data->obj, (void*)data->tag, (int)data->bits); + jl_((void*)data->tag); + isroot = 1; + } + else if (pc == gc_mark_label_addrs[1]) { + // scan_only + gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); + if ((char*)data > data_top) { + jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); + break; + } + jl_safe_printf("%p: Queued root: %p :: %p (bits: %d)\n of type ", + data, data->obj, (void*)data->tag, (int)data->bits); + jl_((void*)data->tag); + isroot = 1; + } + else if (pc == gc_mark_label_addrs[2]) { + // finlist + gc_mark_finlist_t *data = gc_repush_markdata(&sp, gc_mark_finlist_t); + if ((char*)data > data_top) { + jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); + break; + } + jl_safe_printf("%p: Finalizer list from %p to %p\n", data, data->begin, data->end); + isroot = 1; + } + else if (pc == gc_mark_label_addrs[3]) { + // objarray + gc_mark_objarray_t *data = gc_repush_markdata(&sp, gc_mark_objarray_t); + if ((char*)data > data_top) { + jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); + break; + } + jl_safe_printf("%p: %s Array in object %p :: %p -- [%p, %p)\n of type ", + data, prefix, data->parent, ((void**)data->parent)[-1], + data->begin, data->end); + jl_(jl_typeof(data->parent)); + } + else if (pc == gc_mark_label_addrs[4]) { + // obj8 + gc_mark_obj8_t *data = gc_repush_markdata(&sp, gc_mark_obj8_t); + if ((char*)data > data_top) { + jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); + break; + } + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); + jl_fielddesc8_t *desc = (jl_fielddesc8_t*)jl_dt_layout_fields(vt->layout); + jl_safe_printf("%p: %s Object (8bit) %p :: %p -- [%d, %d)\n of type ", + data, prefix, data->parent, ((void**)data->parent)[-1], + (int)(data->begin - desc), (int)(data->end - desc)); + jl_(jl_typeof(data->parent)); + } + else if (pc == gc_mark_label_addrs[5]) { + // obj16 + gc_mark_obj16_t *data = gc_repush_markdata(&sp, gc_mark_obj16_t); + if ((char*)data > data_top) { + jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); + break; + } + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); + jl_fielddesc16_t *desc = (jl_fielddesc16_t*)jl_dt_layout_fields(vt->layout); + jl_safe_printf("%p: %s Object (16bit) %p :: %p -- [%d, %d)\n of type ", + data, prefix, data->parent, ((void**)data->parent)[-1], + (int)(data->begin - desc), (int)(data->end - desc)); + jl_(jl_typeof(data->parent)); + } + else if (pc == gc_mark_label_addrs[6]) { + // obj32 + gc_mark_obj32_t *data = gc_repush_markdata(&sp, gc_mark_obj32_t); + if ((char*)data > data_top) { + jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); + break; + } + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); + jl_fielddesc32_t *desc = (jl_fielddesc32_t*)jl_dt_layout_fields(vt->layout); + jl_safe_printf("%p: %s Object (32bit) %p :: %p -- [%d, %d)\n of type ", + data, prefix, data->parent, ((void**)data->parent)[-1], + (int)(data->begin - desc), (int)(data->end - desc)); + jl_(jl_typeof(data->parent)); + } + else if (pc == gc_mark_label_addrs[7]) { + // stack + gc_mark_stackframe_t *data = gc_repush_markdata(&sp, gc_mark_stackframe_t); + 
if ((char*)data > data_top) { + jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); + break; + } + jl_safe_printf("%p: %s Stack frame %p -- %d of %d (%s)\n", + data, prefix, data->s, (int)data->i, (int)data->nroots >> 1, + (data->nroots & 1) ? "indirect" : "direct"); + } + else if (pc == gc_mark_label_addrs[8]) { + // module_binding + gc_mark_binding_t *data = gc_repush_markdata(&sp, gc_mark_binding_t); + if ((char*)data > data_top) { + jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); + break; + } + jl_safe_printf("%p: %s Module (bindings) %p (bits %d) -- [%p, %p)\n", + data, prefix, data->parent, (int)data->bits, data->begin, data->end); + } + else { + jl_safe_printf("Unknown pc %p --- ABORTING !!!\n", pc); + break; + } + } + ptls->safe_restore = old_buf; +} + #ifdef __cplusplus } #endif diff --git a/src/gc.c b/src/gc.c index 55ba458ffa5179..732404523ccf9e 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1511,6 +1511,845 @@ STATIC_INLINE void gc_assert_datatype(jl_datatype_t *vt) gc_assert_datatype_fail(vt); } +// This stores the label address in the mark loop function. +// We can't directly store that to a global array so we need some hack to get that. +// See the call to `gc_mark_loop` in init with a `NULL` `ptls`. +void *gc_mark_label_addrs[gc_mark_nlabels]; + +// Double the mark stack (both pc and data) with the lock held. +static void NOINLINE gc_mark_stack_resize(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp) +{ + char *old_data = gc_cache->data_stack; + void **pc_stack = sp->pc_start; + size_t stack_size = (char*)sp->pc_end - (char*)pc_stack; + JL_LOCK_NOGC(&gc_cache->stack_lock); + gc_cache->data_stack = (char*)realloc(old_data, stack_size * 2 * sizeof(gc_mark_sp_t)); + sp->data += gc_cache->data_stack - old_data; + + sp->pc_start = gc_cache->pc_stack = (void**)realloc(pc_stack, stack_size * 2 * sizeof(void*)); + gc_cache->pc_stack_end = sp->pc_end = sp->pc_start + stack_size * 2; + sp->pc += sp->pc_start - pc_stack; + JL_UNLOCK_NOGC(&gc_cache->stack_lock); +} + +// Push a work to the stack. The type of the work is marked with `pc` and the data needed +// is in `data` and is of size `data_size`. +// The `sp` keeps track of the current stack pointer and will be updated on return. +// If there isn't enough space on the stack anymore, the stack will be resized with the stack +// lock held. The caller should invalidate any local cache of the stack addresses that's not +// in `gc_cache` or `sp` +// The caller is also responsible for increasing `pc`. +STATIC_INLINE void gc_mark_stack_push(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp, + void *pc, void *data, size_t data_size, int inc) +{ + assert(data_size <= sizeof(gc_mark_data_t)); + if (__unlikely(sp->pc == sp->pc_end)) + gc_mark_stack_resize(gc_cache, sp); + *sp->pc = pc; + memcpy(sp->data, data, data_size); + if (inc) { + sp->data += data_size; + sp->pc++; + } +} + +// Check if the reference is non-NULL and atomically set the mark bit. +// Update `*nptr`, which is the `nptr` field of the parent item, if the object is young. +// Return the tag (with GC bits cleared) and the GC bits in `*ptag` and `*pbits`. +// Return whether the object needs to be scanned / have metadata updated. 
+STATIC_INLINE int gc_try_setmark(jl_value_t *obj, uintptr_t *nptr, + uintptr_t *ptag, uint8_t *pbits) +{ + if (!obj) + return 0; + jl_taggedvalue_t *o = jl_astaggedvalue(obj); + uintptr_t tag = o->header; + if (!gc_marked(tag)) { + uint8_t bits; + int res = gc_setmark_tag(o, GC_MARKED, tag, &bits); + if (!gc_old(bits)) + *nptr = *nptr | 1; + *ptag = tag & ~(uintptr_t)0xf; + *pbits = bits; + return __likely(res); + } + else if (!gc_old(tag)) { + *nptr = *nptr | 1; + } + return 0; +} + +// Queue a finalizer list to be scanned in the mark loop. Start marking from index `start`. +void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp, + arraylist_t *list, size_t start) +{ + size_t len = list->len; + if (len <= start) + return; + jl_value_t **items = (jl_value_t**)list->items; + gc_mark_finlist_t markdata = {items + start, items + len}; + gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[2], &markdata, sizeof(markdata), 1); +} + +// Queue a object to be scanned. The object should already be marked and the GC metadata +// should already be updated for it. Only scanning of the object should be performed. +STATIC_INLINE void gc_mark_queue_scan_obj(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp, + jl_value_t *obj) +{ + jl_taggedvalue_t *o = jl_astaggedvalue(obj); + uintptr_t tag = o->header; + uint8_t bits = tag & 0xf; + tag = tag & ~(uintptr_t)0xf; + gc_mark_marked_obj_t data = {obj, tag, bits}; + gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[1], &data, sizeof(data), 1); +} + +// Mark and queue a object to be scanned. +// The object will be marked atomically which can also happen concurrently. +// It will be queued if the object wasn't marked already (or concurrently by another thread) +// Returns whether the object is young. +STATIC_INLINE int gc_mark_queue_obj(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp, void *_obj) +{ + jl_value_t *obj = (jl_value_t*)jl_assume(_obj); + uintptr_t nptr = 0; + uintptr_t tag = 0; + uint8_t bits = 0; + if (!gc_try_setmark(obj, &nptr, &tag, &bits)) + return (int)nptr; + gc_mark_marked_obj_t data = {obj, tag, bits}; + gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[0], &data, sizeof(data), 1); + return (int)nptr; +} + +// Check if `nptr` is tagged for `old + refyoung`, +// Push the object to the remset and update the `nptr` counter if necessary. +STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, uintptr_t nptr) +{ + if (__unlikely((nptr & 0x3) == 0x3)) { + ptls->heap.remset_nptr += nptr >> 2; + arraylist_t *remset = ptls->heap.remset; + size_t len = remset->len; + if (__unlikely(len >= remset->max)) { + arraylist_push(remset, obj); + } + else { + remset->len = len + 1; + remset->items[len] = obj; + } + } +} + +// Scan a dense array of object references, see `gc_mark_objarray_t` +STATIC_INLINE int gc_mark_scan_objarray(jl_ptls_t ptls, gc_mark_sp_t *sp, + gc_mark_objarray_t *objary, + jl_value_t **begin, jl_value_t **end, + jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) +{ + jl_assume(objary == (gc_mark_objarray_t*)sp->data); + for (; begin < end; begin++) { + *pnew_obj = *begin; + if (*pnew_obj) + verify_parent2("obj array", objary->parent, begin, "elem(%d)", + gc_slot_to_arrayidx(objary->parent, begin)); + if (!gc_try_setmark(*pnew_obj, &objary->nptr, ptag, pbits)) + continue; + begin++; + // Found an object to mark + if (begin < end) { + // Haven't done with this one yet. 
Update the content and push it back + objary->begin = begin; + gc_repush_markdata(sp, gc_mark_objarray_t); + } + else { + // Finished scaning this one, finish up by checking the GC invariance + // and let the next item replacing the current one directly. + gc_mark_push_remset(ptls, objary->parent, objary->nptr); + } + return 1; + } + gc_mark_push_remset(ptls, objary->parent, objary->nptr); + return 0; +} + +// Scan an object with 8bits field descriptors. see `gc_mark_obj8_t` +STATIC_INLINE int gc_mark_scan_obj8(jl_ptls_t ptls, gc_mark_sp_t *sp, gc_mark_obj8_t *obj8, + char *parent, jl_fielddesc8_t *begin, jl_fielddesc8_t *end, + jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) +{ + jl_assume(obj8 == (gc_mark_obj8_t*)sp->data); + jl_assume(begin < end); + for (; begin < end; begin++) { + if (!begin->isptr) + continue; + jl_value_t **slot = (jl_value_t**)(parent + begin->offset); + *pnew_obj = *slot; + if (*pnew_obj) + verify_parent2("object", parent, slot, "field(%d)", + gc_slot_to_fieldidx(parent, slot)); + if (!gc_try_setmark(*pnew_obj, &obj8->nptr, ptag, pbits)) + continue; + begin++; + // Found an object to mark + if (begin < end) { + // Haven't done with this one yet. Update the content and push it back + obj8->begin = begin; + gc_repush_markdata(sp, gc_mark_obj8_t); + } + else { + // Finished scaning this one, finish up by checking the GC invariance + // and let the next item replacing the current one directly. + gc_mark_push_remset(ptls, obj8->parent, obj8->nptr); + } + return 1; + } + gc_mark_push_remset(ptls, obj8->parent, obj8->nptr); + return 0; +} + +// Scan an object with 16bits field descriptors. see `gc_mark_obj16_t` +STATIC_INLINE int gc_mark_scan_obj16(jl_ptls_t ptls, gc_mark_sp_t *sp, gc_mark_obj16_t *obj16, + char *parent, jl_fielddesc16_t *begin, jl_fielddesc16_t *end, + jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) +{ + jl_assume(obj16 == (gc_mark_obj16_t*)sp->data); + jl_assume(begin < end); + for (; begin < end; begin++) { + if (!begin->isptr) + continue; + jl_value_t **slot = (jl_value_t**)(parent + begin->offset); + *pnew_obj = *slot; + if (*pnew_obj) + verify_parent2("object", parent, slot, "field(%d)", + gc_slot_to_fieldidx(parent, slot)); + if (!gc_try_setmark(*pnew_obj, &obj16->nptr, ptag, pbits)) + continue; + begin++; + // Found an object to mark + if (begin < end) { + // Haven't done with this one yet. Update the content and push it back + obj16->begin = begin; + gc_repush_markdata(sp, gc_mark_obj16_t); + } + else { + // Finished scaning this one, finish up by checking the GC invariance + // and let the next item replacing the current one directly. + gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); + } + return 1; + } + gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); + return 0; +} + +// Scan an object with 32bits field descriptors. 
see `gc_mark_obj32_t` +STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, gc_mark_sp_t *sp, gc_mark_obj32_t *obj32, + char *parent, jl_fielddesc32_t *begin, jl_fielddesc32_t *end, + jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) +{ + jl_assume(obj32 == (gc_mark_obj32_t*)sp->data); + jl_assume(begin < end); + for (; begin < end; begin++) { + if (!begin->isptr) + continue; + jl_value_t **slot = (jl_value_t**)(parent + begin->offset); + *pnew_obj = *slot; + if (*pnew_obj) + verify_parent2("object", parent, slot, "field(%d)", + gc_slot_to_fieldidx(parent, slot)); + if (!gc_try_setmark(*pnew_obj, &obj32->nptr, ptag, pbits)) + continue; + begin++; + // Found an object to mark + if (begin < end) { + // Haven't done with this one yet. Update the content and push it back + obj32->begin = begin; + gc_repush_markdata(sp, gc_mark_obj32_t); + } + else { + // Finished scaning this one, finish up by checking the GC invariance + // and let the next item replacing the current one directly. + gc_mark_push_remset(ptls, obj32->parent, obj32->nptr); + } + return 1; + } + gc_mark_push_remset(ptls, obj32->parent, obj32->nptr); + return 0; +} + +#ifdef __GNUC__ +# define gc_mark_laddr(name) (&&name) +# define gc_mark_jmp(ptr) goto *(ptr) +#else +enum { + GC_MARK_L_marked_obj, + GC_MARK_L_scan_only, + GC_MARK_L_finlist, + GC_MARK_L_objarray, + GC_MARK_L_obj8, + GC_MARK_L_obj16, + GC_MARK_L_obj32, + GC_MARK_L_stack, + GC_MARK_L_module_binding +}; +#define gc_mark_laddr(name) ((void*)(uintptr_t)GC_MARK_L_##name) +#define gc_mark_jmp(ptr) do { \ + switch ((int)(uintptr_t)ptr) { \ + case GC_MARK_L_marked_obj: \ + goto marked_obj; \ + case GC_MARK_L_scan_only: \ + goto scan_only; \ + case GC_MARK_L_finlist: \ + goto finlist; \ + case GC_MARK_L_objarray: \ + goto objarray; \ + case GC_MARK_L_obj8: \ + goto obj8; \ + case GC_MARK_L_obj16: \ + goto obj16; \ + case GC_MARK_L_obj32: \ + goto obj32; \ + case GC_MARK_L_stack: \ + goto stack; \ + case GC_MARK_L_module_binding: \ + goto module_binding; \ + default: \ + abort(); \ + } \ + } while (0) +#endif + +// This is the main marking loop. +// It uses an iterative (mostly) Depth-first search (DFS) to mark all the objects. +// Instead of using the native stack, two stacks are manually maintained, +// one (fixed-size) pc stack which stores the return address and one (variable-size) +// data stack which stores the local variables needed by the scanning code. +// Using a manually maintained stack has a few advantages +// +// 1. We can resize the stack as we go and never worry about stack overflow +// This is especitally useful when enters the GC in a deep call stack. +// It also removes the very deep GC call stack in a profile. +// 2. We can minimize the number of local variables to save on the stack. +// This includes minimizing the sizes of the stack frames and only saving variables +// that have been changed before making "function calls" (i.e. `goto mark;`) +// 3. We can perform end-of-loop tail-call optimization for common cases. +// 4. The marking can be interrupted more easily since all the states are maintained +// in a well-defined format already. +// This will be useful if we want to have incremental marking again. +// 5. The frames can be stolen by another thread more easily and it is not necessary +// to copy works to be stolen to another queue. Useful for parallel marking. +// (Will still require synchronization in stack popping of course.) +// 6. A flat function (i.e. 
no or very few function calls) also gives the compiler
+// the opportunity to keep more state in registers that doesn't have to be spilled as often.
+//
+// We use two stacks so that a thief on another thread can steal the fixed-size pc stack
+// and use that to figure out the size of the struct on the variable-size data stack.
+//
+// The main disadvantages are that we bypass some stack-based CPU optimizations, including the
+// stack engine and return address prediction.
+// Using two stacks also doubles the number of operations on the stack pointers,
+// though we still only need to use one of them (the pc stack pointer) for bounds checks.
+// In general, it seems that the reduction in stack memory ops and instruction count
+// has a larger positive effect on performance. =)
+
+// As a general guide we do not want to make non-inlined function calls in this function
+// if possible, since a large number of registers has to be spilled when that happens.
+// This is especially true on x86, which doesn't have many (any?)
+// callee-saved general purpose registers.
+// (OTOH, the spill will likely make use of the stack engine, which is otherwise idle, so
+// the performance impact is minimal as long as it's not in the hottest path.)
+
+// There are three external entry points to the loop, corresponding to the labels
+// `marked_obj`, `scan_only` and `finlist` (see the corresponding functions
+// `gc_mark_queue_obj`, `gc_mark_queue_scan_obj` and `gc_mark_queue_finlist` above).
+// The scanning of an object starts with `goto mark`, which updates the metadata and scans
+// the object whose information is stored in `new_obj`, `tag` and `bits`.
+// The branches in `mark` dispatch the object to one of the scan "loop"s to be scanned
+// as either a normal julia object or one of the special objects with a specific storage format.
+// Each of the scan "loop"s performs a DFS of the object in the following way:
+//
+// 1. When encountering a pointer (julia object reference) slot, load it, perform a NULL check
+//    and atomically set the mark bits to determine if the object needs to be scanned.
+// 2. If yes, push the current frame back onto the mark stack (after updating the fields that
+//    have changed) using `gc_repush_markdata` to increment the stack pointers.
+//    This step can also be replaced by a tail call, by finishing up the marking of the current
+//    object when the end of the current object is reached.
+// 3. Jump to `mark`. The marking of the current object will be resumed after the child is
+//    scanned, by popping the stack frame back.
+//
+// Some of the special object scans use BFS to simplify the code (Task and Module).
+
+// The jumps from the dispatch to the scan "loop"s are done by first pushing a frame
+// onto the stacks while incrementing only the data stack pointer before jumping to the loop.
+// This way the scan "loop" gets exactly what it expects after a stack pop.
+// Additional optimizations are done for some of the common cases by skipping
+// the unnecessary data stack pointer increment and the load from the stack
+// (i.e. store-to-load forwarding). See `objary_loaded`, `obj8_loaded` and `obj16_loaded`.
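/* Aside (illustration only, not part of this patch): the `gc_mark_laddr`/`gc_mark_jmp`
 * macros above rely on GCC's labels-as-values extension for dispatch. A minimal
 * standalone example of that style, with made-up labels:
 */
#ifdef __GNUC__
static int dispatch_demo(int which)
{
    // Addresses of local labels, taken with the GNU `&&` operator.
    static void *const labels[] = {&&do_add, &&do_sub};
    int x = 10;
    goto *labels[which & 1]; // computed goto: jump through a stored label address
do_add:
    return x + 1;
do_sub:
    return x - 1;
}
#endif
/* Compilers without the extension fall back to the switch over an enum shown in the
 * #else branch of the macro block above. */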
+NOINLINE void *const *gc_mark_loop(jl_ptls_t ptls, gc_mark_sp_t sp) +{ + static void *const label_addrs[] = { + gc_mark_laddr(marked_obj), gc_mark_laddr(scan_only), gc_mark_laddr(finlist), + gc_mark_laddr(objarray), gc_mark_laddr(obj8), gc_mark_laddr(obj16), + gc_mark_laddr(obj32), gc_mark_laddr(stack), gc_mark_laddr(module_binding) + }; + static_assert(sizeof(label_addrs) == gc_mark_nlabels * sizeof(void*), + "Label number out-of-sync"); + + if (__unlikely(ptls == NULL)) + return label_addrs; + + jl_value_t *new_obj = NULL; + uintptr_t tag = 0; + uint8_t bits = 0; + int meta_updated = 0; + + gc_mark_objarray_t *objary; + jl_value_t **objary_begin; + jl_value_t **objary_end; + + gc_mark_obj8_t *obj8; + char *obj8_parent; + jl_fielddesc8_t *obj8_begin; + jl_fielddesc8_t *obj8_end; + + gc_mark_obj16_t *obj16; + char *obj16_parent; + jl_fielddesc16_t *obj16_begin; + jl_fielddesc16_t *obj16_end; + +pop: + if (sp.pc == sp.pc_start) { + // TODO: stealing form another thread + return NULL; + } + sp.pc--; + gc_mark_jmp(*sp.pc); // computed goto + +marked_obj: { + // An object that has been marked and needs have metadata updated and scanned. + gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t); + new_obj = obj->obj; + tag = obj->tag; + bits = obj->bits; + goto mark; + } + +scan_only: { + // An object that has been marked and needs to be scanned. + gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t); + new_obj = obj->obj; + tag = obj->tag; + bits = obj->bits; + meta_updated = 1; + goto mark; + } + +objarray: + objary = gc_pop_markdata(&sp, gc_mark_objarray_t); + objary_begin = objary->begin; + objary_end = objary->end; +objarray_loaded: + if (gc_mark_scan_objarray(ptls, &sp, objary, objary_begin, objary_end, + &new_obj, &tag, &bits)) + goto mark; + goto pop; + +obj8: + obj8 = gc_pop_markdata(&sp, gc_mark_obj8_t); + obj8_parent = (char*)obj8->parent; + obj8_begin = obj8->begin; + obj8_end = obj8->end; +obj8_loaded: + if (gc_mark_scan_obj8(ptls, &sp, obj8, obj8_parent, obj8_begin, obj8_end, + &new_obj, &tag, &bits)) + goto mark; + goto pop; + +obj16: + obj16 = gc_pop_markdata(&sp, gc_mark_obj16_t); + obj16_parent = (char*)obj16->parent; + obj16_begin = obj16->begin; + obj16_end = obj16->end; +obj16_loaded: + if (gc_mark_scan_obj16(ptls, &sp, obj16, obj16_parent, obj16_begin, obj16_end, + &new_obj, &tag, &bits)) + goto mark; + goto pop; + +obj32: { + gc_mark_obj32_t *obj32 = gc_pop_markdata(&sp, gc_mark_obj32_t); + char *parent = (char*)obj32->parent; + jl_fielddesc32_t *begin = obj32->begin; + jl_fielddesc32_t *end = obj32->end; + if (gc_mark_scan_obj32(ptls, &sp, obj32, parent, begin, end, &new_obj, &tag, &bits)) + goto mark; + goto pop; + } + +stack: { + // Scan the stack. see `gc_mark_stackframe_t` + // The task object this stack belongs to is being scanned separately as a normal + // 8bit field descriptor object. 
+ gc_mark_stackframe_t *stack = gc_pop_markdata(&sp, gc_mark_stackframe_t); + jl_gcframe_t *s = stack->s; + size_t i = stack->i; + size_t nroots = stack->nroots; + uintptr_t offset = stack->offset; + uintptr_t lb = stack->lb; + uintptr_t ub = stack->ub; + size_t nr = nroots >> 1; + uintptr_t nptr = 0; + while (1) { + jl_value_t ***rts = (jl_value_t***)(((void**)s) + 2); + for (; i < nr; i++) { + if (nroots & 1) { + void **slot = (void**)gc_read_stack(&rts[i], offset, lb, ub); + new_obj = (jl_value_t*)gc_read_stack(slot, offset, lb, ub); + } + else { + new_obj = (jl_value_t*)gc_read_stack(&rts[i], offset, lb, ub); + } + if (!gc_try_setmark(new_obj, &nptr, &tag, &bits)) + continue; + i++; + if (i < nr) { + // Haven't done with this one yet. Update the content and push it back + stack->i = i; + gc_repush_markdata(&sp, gc_mark_stackframe_t); + } + else if ((s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub))) { + stack->s = s; + stack->i = 0; + stack->nroots = gc_read_stack(&s->nroots, offset, lb, ub); + gc_repush_markdata(&sp, gc_mark_stackframe_t); + } + goto mark; + } + s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub); + if (s != 0) { + stack->s = s; + i = 0; + nroots = stack->nroots = gc_read_stack(&s->nroots, offset, lb, ub); + nr = nroots >> 1; + continue; + } + goto pop; + } + } + +module_binding: { + // Scan a module. see `gc_mark_binding_t` + // Other fields of the module will be scanned after the bindings are scanned + gc_mark_binding_t *binding = gc_pop_markdata(&sp, gc_mark_binding_t); + jl_binding_t **begin = binding->begin; + jl_binding_t **end = binding->end; + uint8_t mbits = binding->bits; + for (; begin < end; begin += 2) { + jl_binding_t *b = *begin; + if (b == (jl_binding_t*)HT_NOTFOUND) + continue; + gc_setmark_buf_(ptls, b, mbits, sizeof(jl_binding_t)); + void *vb = jl_astaggedvalue(b); + verify_parent1("module", binding->parent, &vb, "binding_buff"); + (void)vb; + jl_value_t *value = b->value; + jl_value_t *globalref = b->globalref; + if (value) { + verify_parent2("module", binding->parent, + &b->value, "binding(%s)", jl_symbol_name(b->name)); + if (gc_try_setmark(value, &binding->nptr, &tag, &bits)) { + new_obj = value; + begin += 2; + binding->begin = begin; + gc_repush_markdata(&sp, gc_mark_binding_t); + uintptr_t gr_tag; + uint8_t gr_bits; + if (gc_try_setmark(globalref, &binding->nptr, &gr_tag, &gr_bits)) { + gc_mark_marked_obj_t data = {globalref, gr_tag, gr_bits}; + gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(marked_obj), + &data, sizeof(data), 1); + } + goto mark; + } + } + if (gc_try_setmark(globalref, &binding->nptr, &tag, &bits)) { + begin += 2; + binding->begin = begin; + gc_repush_markdata(&sp, gc_mark_binding_t); + new_obj = globalref; + goto mark; + } + } + jl_module_t *m = binding->parent; + int scanparent = gc_try_setmark((jl_value_t*)m->parent, &binding->nptr, &tag, &bits); + size_t nusings = m->usings.len; + if (nusings) { + // this is only necessary because bindings for "using" modules + // are added only when accessed. therefore if a module is replaced + // after "using" it but before accessing it, this array might + // contain the only reference. 
+ objary_begin = (jl_value_t**)m->usings.items; + objary_end = objary_begin + nusings; + gc_mark_objarray_t data = {(jl_value_t*)m, objary_begin, objary_end, binding->nptr}; + gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), + &data, sizeof(data), 0); + if (!scanparent) { + objary = (gc_mark_objarray_t*)sp.data; + goto objarray_loaded; + } + sp.data += sizeof(data); + sp.pc++; + } + else { + gc_mark_push_remset(ptls, (jl_value_t*)m, binding->nptr); + } + if (scanparent) { + new_obj = (jl_value_t*)m->parent; + goto mark; + } + goto pop; + } + +finlist: { + // Scan a finalizer list. see `gc_mark_finlist_t` + gc_mark_finlist_t *finlist = gc_pop_markdata(&sp, gc_mark_finlist_t); + jl_value_t **begin = finlist->begin; + jl_value_t **end = finlist->end; + for (; begin < end; begin++) { + new_obj = *begin; + if (__unlikely(!new_obj)) + continue; + if (gc_ptr_tag(new_obj, 1)) { + new_obj = (jl_value_t*)gc_ptr_clear_tag(new_obj, 1); + begin++; + assert(begin < end); + } + uintptr_t nptr = 0; + if (!gc_try_setmark(new_obj, &nptr, &tag, &bits)) + continue; + begin++; + // Found an object to mark + if (begin < end) { + // Haven't done with this one yet. Update the content and push it back + finlist->begin = begin; + gc_repush_markdata(&sp, gc_mark_finlist_t); + } + goto mark; + } + goto pop; + } + +mark: { + // Generic scanning entry point. + // Expects `new_obj`, `tag` and `bits` to be set correctly. +#ifdef JL_DEBUG_BUILD + if (new_obj == gc_findval) + jl_raise_debugger(); +#endif + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + jl_datatype_t *vt = (jl_datatype_t*)tag; + int update_meta = __likely(!meta_updated && !gc_verifying); + meta_updated = 0; + // Symbols are always marked + assert(vt != jl_symbol_type); + if (vt == jl_simplevector_type) { + size_t l = jl_svec_len(new_obj); + jl_value_t **data = jl_svec_data(new_obj); + if (update_meta) + gc_setmark(ptls, o, bits, l * sizeof(void*) + sizeof(jl_svec_t)); + uintptr_t nptr = (l << 2) | (bits & GC_OLD); + objary_begin = data; + objary_end = data + l; + gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, nptr}; + gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), + &markdata, sizeof(markdata), 0); + objary = (gc_mark_objarray_t*)sp.data; + goto objarray_loaded; + } + else if (vt->name == jl_array_typename) { + jl_array_t *a = (jl_array_t*)new_obj; + jl_array_flags_t flags = a->flags; + if (update_meta) { + if (flags.pooled) + gc_setmark_pool(ptls, o, bits); + else + gc_setmark_big(ptls, o, bits); + } + if (flags.how == 1) { + void *val_buf = jl_astaggedvalue((char*)a->data - a->offset * a->elsize); + verify_parent1("array", new_obj, &val_buf, "buffer ('loc' addr is meaningless)"); + (void)val_buf; + gc_setmark_buf_(ptls, (char*)a->data - a->offset * a->elsize, + bits, array_nbytes(a)); + } + else if (flags.how == 2) { + if (update_meta) { + objprofile_count(jl_malloc_tag, bits == GC_OLD_MARKED, + array_nbytes(a)); + if (bits == GC_OLD_MARKED) { + ptls->gc_cache.perm_scanned_bytes += array_nbytes(a); + } + else { + ptls->gc_cache.scanned_bytes += array_nbytes(a); + } + } + } + else if (flags.how == 3) { + jl_value_t *owner = jl_array_data_owner(a); + uintptr_t nptr = (1 << 2) | (bits & GC_OLD); + int markowner = gc_try_setmark(owner, &nptr, &tag, &bits); + gc_mark_push_remset(ptls, new_obj, nptr); + if (markowner) { + new_obj = owner; + goto mark; + } + goto pop; + } + if (!flags.ptrarray || a->data == NULL) + goto pop; + size_t l = jl_array_len(a); + uintptr_t nptr = (l << 2) | (bits & GC_OLD); + 
objary_begin = (jl_value_t**)a->data; + objary_end = objary_begin + l; + gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, nptr}; + gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), + &markdata, sizeof(markdata), 0); + objary = (gc_mark_objarray_t*)sp.data; + goto objarray_loaded; + } + else if (vt == jl_module_type) { + if (update_meta) + gc_setmark(ptls, o, bits, sizeof(jl_module_t)); + jl_module_t *m = (jl_module_t*)new_obj; + jl_binding_t **table = (jl_binding_t**)m->bindings.table; + size_t bsize = m->bindings.size; + uintptr_t nptr = ((bsize + m->usings.len + 1) << 2) | (bits & GC_OLD); + gc_mark_binding_t markdata = {m, table + 1, table + bsize, nptr, bits}; + gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(module_binding), + &markdata, sizeof(markdata), 0); + sp.data += sizeof(markdata); + goto module_binding; + } + else if (vt == jl_task_type) { + if (update_meta) + gc_setmark(ptls, o, bits, sizeof(jl_task_t)); + assert(jl_task_type->layout->fielddesc_type == 0); + jl_fielddesc8_t *desc = (jl_fielddesc8_t*)jl_dt_layout_fields(jl_task_type->layout); + size_t nfields = jl_task_type->layout->nfields; + assert(nfields > 0); + jl_fielddesc8_t *desc_end = desc + nfields; + gc_mark_obj8_t markdata = {new_obj, desc, desc_end, (9 << 2) | 1 | bits}; + gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), + &markdata, sizeof(markdata), 0); + jl_task_t *ta = (jl_task_t*)new_obj; + gc_scrub_record_task(ta); + int stkbuf = (ta->stkbuf != (void*)(intptr_t)-1 && ta->stkbuf != NULL); + int16_t tid = ta->tid; + jl_ptls_t ptls2 = jl_all_tls_states[tid]; + if (stkbuf) { +#ifdef COPY_STACKS + gc_setmark_buf_(ptls, ta->stkbuf, bits, ta->bufsz); +#else + // stkbuf isn't owned by julia for the root task + if (ta != ptls2->root_task) { + gc_setmark_buf_(ptls, ta->stkbuf, bits, ta->ssize); + } +#endif + } + jl_gcframe_t *s = NULL; + size_t nroots; + uintptr_t offset = 0; + uintptr_t lb = 0; + uintptr_t ub = (uintptr_t)-1; + if (ta == ptls2->current_task) { + s = ptls2->pgcstack; + } + else if (stkbuf) { + s = ta->gcstack; +#ifdef COPY_STACKS + ub = (uintptr_t)ptls2->stackbase; + lb = ub - ta->ssize; + offset = (uintptr_t)ta->stkbuf - lb; +#endif + } + if (s) { + sp.data += sizeof(markdata); + sp.pc++; + nroots = gc_read_stack(&s->nroots, offset, lb, ub); + gc_mark_stackframe_t stackdata = {s, 0, nroots, offset, lb, ub}; + gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(stack), + &stackdata, sizeof(stackdata), 0); + sp.data += sizeof(stackdata); + goto stack; + } + else { + obj8 = (gc_mark_obj8_t*)sp.data; + obj8_parent = (char*)ta; + obj8_begin = desc; + obj8_end = desc_end; + goto obj8_loaded; + } + } + else if (vt == jl_string_type) { + if (update_meta) + gc_setmark(ptls, o, bits, jl_string_len(new_obj) + sizeof(size_t) + 1); + goto pop; + } + else { + gc_assert_datatype(vt); + if (update_meta) + gc_setmark(ptls, o, bits, jl_datatype_size(vt)); + if (vt == jl_weakref_type) + goto pop; + const jl_datatype_layout_t *layout = vt->layout; + uint32_t npointers = layout->npointers; + if (npointers == 0) + goto pop; + uintptr_t nptr = ((npointers & 0xff) << (npointers & 0x300)) << 2; + nptr = nptr | (bits & GC_OLD); + uint32_t offsets = jl_datatype_layout_n_nonptr(layout); + size_t nfields = layout->nfields; + nfields -= offsets & 0xffff; + size_t first = offsets >> 16; + if (layout->fielddesc_type == 0) { + jl_fielddesc8_t *desc = (jl_fielddesc8_t*)jl_dt_layout_fields(layout); + obj8_parent = (char*)new_obj; + obj8_begin = desc + first; + obj8_end = desc + 
nfields; + assert(obj8_begin < obj8_end); + gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; + gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), + &markdata, sizeof(markdata), 0); + obj8 = (gc_mark_obj8_t*)sp.data; + goto obj8_loaded; + } + else if (layout->fielddesc_type == 1) { + jl_fielddesc16_t *desc = (jl_fielddesc16_t*)jl_dt_layout_fields(layout); + obj16_parent = (char*)new_obj; + obj16_begin = desc + first; + obj16_end = desc + nfields; + assert(obj16_begin < obj16_end); + gc_mark_obj16_t markdata = {new_obj, obj16_begin, obj16_end, nptr}; + gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj16), + &markdata, sizeof(markdata), 0); + obj16 = (gc_mark_obj16_t*)sp.data; + goto obj16_loaded; + } + else { + // This is very uncommon + // Do not do store to load forwarding to save some code size + assert(layout->fielddesc_type == 2); + jl_fielddesc32_t *desc = (jl_fielddesc32_t*)jl_dt_layout_fields(layout); + assert(first < nfields); + gc_mark_obj32_t markdata = {new_obj, desc + first, desc + nfields, nptr}; + gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj32), + &markdata, sizeof(markdata), 0); + sp.data += sizeof(markdata); + goto obj32; + } + } + } +} + #define MAX_MARK_DEPTH 400 // Scan a marked object `v` and recursively mark its children. // The object will be queued on the mark stack when recursion depth @@ -2157,6 +2996,11 @@ void jl_init_thread_heap(jl_ptls_t ptls) gc_cache->perm_scanned_bytes = 0; gc_cache->scanned_bytes = 0; gc_cache->nbig_obj = 0; + JL_MUTEX_INIT(&gc_cache->stack_lock); + size_t init_size = 1024; + gc_cache->pc_stack = (void**)malloc(init_size * sizeof(void*)); + gc_cache->pc_stack_end = gc_cache->pc_stack + init_size; + gc_cache->data_stack = (char*)malloc(init_size * sizeof(gc_mark_data_t)); } // System-wide initializations @@ -2178,6 +3022,11 @@ void jl_gc_init(void) if (maxmem > max_collect_interval) max_collect_interval = maxmem; #endif + gc_mark_sp_t sp = {NULL, NULL, NULL, NULL}; + void *const *label_addrs = gc_mark_loop(NULL, sp); + for (int i = 0; i < gc_mark_nlabels; i++) { + gc_mark_label_addrs[i] = label_addrs[i]; + } } JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) diff --git a/src/gc.h b/src/gc.h index 48cc90547f229f..c512bb97203f0b 100644 --- a/src/gc.h +++ b/src/gc.h @@ -74,6 +74,128 @@ typedef struct { int full_sweep; } jl_gc_num_t; +typedef struct { + void **pc; // Current stack address for the pc (up growing) + char *data; // Current stack address for the data (up growing) + void **pc_start; // Cached value of `gc_cache->pc_stack` + void **pc_end; // Cached value of `gc_cache->pc_stack_end` +} gc_mark_sp_t; + +// Pop a data struct from the mark data stack (i.e. decrease the stack pointer) +// This should be used after dispatch and therefore the pc stack pointer is already popped from +// the stack. +STATIC_INLINE void *gc_pop_markdata_(gc_mark_sp_t *sp, size_t size) +{ + char *data = sp->data - size; + sp->data = data; + return data; +} +#define gc_pop_markdata(sp, type) ((type*)gc_pop_markdata_(sp, sizeof(type))) + +// Re-push a frame to the mark stack (both data and pc) +// The data and pc are expected to be on the stack (or updated in place) already. +// Mainly useful to pause the current scanning in order to scan an new object. 
+STATIC_INLINE void *gc_repush_markdata_(gc_mark_sp_t *sp, size_t size) +{ + char *data = sp->data; + sp->pc++; + sp->data = data + size; + return data; +} +#define gc_repush_markdata(sp, type) ((type*)gc_repush_markdata_(sp, sizeof(type))) + +/** + * The `nptr` member of marking data records the number of pointers slots referenced by + * an object to be used in the full collection heuristics as well as whether the object + * references young objects. + * `nptr >> 2` is the number of pointers fields referenced by the object. + * The lowest bit of `nptr` is set if the object references young object. + * The 2nd lowest bit of `nptr` is the GC old bits of the object after marking. + * A `0x3` in the low bits means that the object needs to be in the remset. + */ + +// An generic object that's marked and needs to be scanned +// The metadata might need update too (depend on the PC) +typedef struct { + jl_value_t *obj; // The object + uintptr_t tag; // The tag with the GC bits masked out + uint8_t bits; // The GC bits after tagging (`bits & 1 == 1`) +} gc_mark_marked_obj_t; + +// An object array. This can come from an array, svec, or the using array or a module +typedef struct { + jl_value_t *parent; // The parent object to trigger write barrier on. + jl_value_t **begin; // The first slot to be scanned. + jl_value_t **end; // The end address (after the last slot to be scanned) + uintptr_t nptr; // See notes about `nptr` above. +} gc_mark_objarray_t; + +// A normal object with 8bits field descriptors +typedef struct { + jl_value_t *parent; // The parent object to trigger write barrier on. + jl_fielddesc8_t *begin; // Current field descriptor. + jl_fielddesc8_t *end; // End of field descriptor. + uintptr_t nptr; // See notes about `nptr` above. +} gc_mark_obj8_t; + +// A normal object with 16bits field descriptors +typedef struct { + jl_value_t *parent; // The parent object to trigger write barrier on. + jl_fielddesc16_t *begin; // Current field descriptor. + jl_fielddesc16_t *end; // End of field descriptor. + uintptr_t nptr; // See notes about `nptr` above. +} gc_mark_obj16_t; + +// A normal object with 32bits field descriptors +typedef struct { + jl_value_t *parent; // The parent object to trigger write barrier on. + jl_fielddesc32_t *begin; // Current field descriptor. + jl_fielddesc32_t *end; // End of field descriptor. + uintptr_t nptr; // See notes about `nptr` above. +} gc_mark_obj32_t; + +// Stack frame +typedef struct { + jl_gcframe_t *s; // The current stack frame + size_t i; // The current slot index in the frame + size_t nroots; // `nroots` fields in the frame + // Parameters to mark the copy_stack range. + uintptr_t offset; + uintptr_t lb; + uintptr_t ub; +} gc_mark_stackframe_t; + +// Module bindings. This is also the beginning of module scanning. +// The loop will start marking other references in a module after the bindings are marked +typedef struct { + jl_module_t *parent; // The parent module to trigger write barrier on. + jl_binding_t **begin; // The first slot to be scanned. + jl_binding_t **end; // The end address (after the last slot to be scanned) + uintptr_t nptr; // See notes about `nptr` above. + uint8_t bits; // GC bits of the module (the bits to mark the binding buffer with) +} gc_mark_binding_t; + +// Finalizer list +typedef struct { + jl_value_t **begin; + jl_value_t **end; +} gc_mark_finlist_t; + +// This is used to determine the max size of the data objects on the data stack. 
+// We'll use this size to determine the size of the data stack corresponding to a +// PC stack size. Since the data objects are not all of the same size, we'll waste +// some memory on the data stack this way but that size is unlikely going to be significant. +typedef union { + gc_mark_marked_obj_t marked; + gc_mark_objarray_t objarray; + gc_mark_obj8_t obj8; + gc_mark_obj16_t obj16; + gc_mark_obj32_t obj32; + gc_mark_stackframe_t stackframe; + gc_mark_binding_t binding; + gc_mark_finlist_t finlist; +} gc_mark_data_t; + // layout for big (>2k) objects typedef struct _bigval_t { @@ -343,12 +465,26 @@ STATIC_INLINE void gc_big_object_link(bigval_t *hdr, bigval_t **list) *list = hdr; } +STATIC_INLINE void gc_mark_sp_init(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp) +{ + sp->pc = gc_cache->pc_stack; + sp->data = gc_cache->data_stack; + sp->pc_start = gc_cache->pc_stack; + sp->pc_end = gc_cache->pc_stack_end; +} + void mark_all_roots(jl_ptls_t ptls); void gc_mark_object_list(jl_ptls_t ptls, arraylist_t *list, size_t start); void visit_mark_stack(jl_ptls_t ptls); +void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp, + arraylist_t *list, size_t start); +void *const *gc_mark_loop(jl_ptls_t ptls, gc_mark_sp_t sp); void gc_debug_init(void); void jl_mark_box_caches(jl_ptls_t ptls); +#define gc_mark_nlabels 9 +extern void *gc_mark_label_addrs[gc_mark_nlabels]; + // GC pages void jl_gc_init_page(void); @@ -461,10 +597,13 @@ extern int gc_verifying; #else #define gc_verify(ptls) #define verify_val(v) -#define verify_parent1(ty,obj,slot,arg1) -#define verify_parent2(ty,obj,slot,arg1,arg2) +#define verify_parent1(ty,obj,slot,arg1) do {} while (0) +#define verify_parent2(ty,obj,slot,arg1,arg2) do {} while (0) #define gc_verifying (0) #endif +int gc_slot_to_fieldidx(void *_obj, void *slot); +int gc_slot_to_arrayidx(void *_obj, void *begin); +NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, gc_mark_sp_t sp, int pc_offset); #ifdef GC_DEBUG_ENV JL_DLLEXPORT extern jl_gc_debug_env_t jl_gc_debug_env; diff --git a/src/julia_threads.h b/src/julia_threads.h index 7d7c1275880072..1051d740c8b853 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -88,6 +88,10 @@ typedef struct { // this makes sure that a single objects can only appear once in // the lists (the mark bit cannot be flipped to `0` without sweeping) void *big_obj[1024]; + jl_mutex_t stack_lock; + void **pc_stack; + void **pc_stack_end; + char *data_stack; } jl_gc_mark_cache_t; // This includes all the thread local states we care about for a thread. 
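The third patch below converts callers to this interface. The call sequence it uses, pieced together from the gc-debug.c hunks that follow and shown here only as a usage summary, is roughly:

    jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache;
    gc_mark_sp_t sp;
    gc_mark_sp_init(gc_cache, &sp);                         // point sp at the per-thread pc/data stacks
    gc_mark_queue_all_roots(ptls, &sp);                     // push the root set
    gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0);  // queue finalizer lists
    gc_mark_loop(ptls, sp);                                 // drain the stacks, marking everything reachable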
From 2d19700e03d925d17e4b0eee05c4a2194790766c Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Sun, 23 Apr 2017 13:31:36 -0400 Subject: [PATCH 3/3] Use mark loop in GC --- src/gc-debug.c | 26 +-- src/gc.c | 454 +++++-------------------------------------------- src/gc.h | 4 +- 3 files changed, 64 insertions(+), 420 deletions(-) diff --git a/src/gc-debug.c b/src/gc-debug.c index de30f58986e653..4eca440e684775 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -198,18 +198,21 @@ static void restore(void) static void gc_verify_track(jl_ptls_t ptls) { + jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; do { + gc_mark_sp_t sp; + gc_mark_sp_init(gc_cache, &sp); arraylist_push(&lostval_parents_done, lostval); jl_printf(JL_STDERR, "Now looking for %p =======\n", lostval); clear_mark(GC_CLEAN); - mark_all_roots(ptls); - gc_mark_object_list(ptls, &to_finalize, 0); + gc_mark_queue_all_roots(ptls, &sp); + gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); for (int i = 0;i < jl_n_threads;i++) { jl_ptls_t ptls2 = jl_all_tls_states[i]; - gc_mark_object_list(ptls, &ptls2->finalizers, 0); + gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } - gc_mark_object_list(ptls, &finalizer_list_marked, 0); - visit_mark_stack(ptls); + gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); + gc_mark_loop(ptls, sp); if (lostval_parents.len == 0) { jl_printf(JL_STDERR, "Could not find the missing link. We missed a toplevel root. This is odd.\n"); break; @@ -243,19 +246,22 @@ static void gc_verify_track(jl_ptls_t ptls) void gc_verify(jl_ptls_t ptls) { + jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; + gc_mark_sp_t sp; + gc_mark_sp_init(gc_cache, &sp); lostval = NULL; lostval_parents.len = 0; lostval_parents_done.len = 0; clear_mark(GC_CLEAN); gc_verifying = 1; - mark_all_roots(ptls); - gc_mark_object_list(ptls, &to_finalize, 0); + gc_mark_queue_all_roots(ptls, &sp); + gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); for (int i = 0;i < jl_n_threads;i++) { jl_ptls_t ptls2 = jl_all_tls_states[i]; - gc_mark_object_list(ptls, &ptls2->finalizers, 0); + gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } - gc_mark_object_list(ptls, &finalizer_list_marked, 0); - visit_mark_stack(ptls); + gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); + gc_mark_loop(ptls, sp); int clean_len = bits_save[GC_CLEAN].len; for(int i = 0; i < clean_len + bits_save[GC_OLD].len; i++) { jl_taggedvalue_t *v = (jl_taggedvalue_t*)bits_save[i >= clean_len ? GC_OLD : GC_CLEAN].items[i >= clean_len ? i - clean_len : i]; diff --git a/src/gc.c b/src/gc.c index 732404523ccf9e..b0382b60db8ae3 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1256,25 +1256,6 @@ static void gc_sweep_pool(int sweep_full) // mark phase -static jl_value_t **mark_stack = NULL; -static jl_value_t **mark_stack_base = NULL; -static size_t mark_stack_size = 0; -static size_t mark_sp = 0; - -static void grow_mark_stack(void) -{ - size_t newsz = mark_stack_size>0 ? 
mark_stack_size*2 : 32000; - size_t offset = mark_stack - mark_stack_base; - mark_stack_base = (jl_value_t**)realloc(mark_stack_base, newsz*sizeof(void*)); - if (mark_stack_base == NULL) { - jl_printf(JL_STDERR, "Couldn't grow mark stack to : %" PRIuPTR "\n", - (uintptr_t)newsz); - exit(1); - } - mark_stack = mark_stack_base + offset; - mark_stack_size = newsz; -} - JL_DLLEXPORT void jl_gc_queue_root(jl_value_t *ptr) { jl_ptls_t ptls = jl_get_ptls_states(); @@ -1306,36 +1287,9 @@ void gc_queue_binding(jl_binding_t *bnd) arraylist_push(&ptls->heap.rem_bindings, bnd); } -static void gc_scan_obj_(jl_ptls_t ptls, jl_value_t *v, int d, - uintptr_t tag, uint8_t bits); -static void gc_mark_obj(jl_ptls_t ptls, jl_value_t *v, - uintptr_t tag, uint8_t bits); #ifdef JL_DEBUG_BUILD static void *volatile gc_findval; // for usage from gdb, for finding the gc-root for a value #endif -// Returns whether the object is young -STATIC_INLINE int gc_push_root(jl_ptls_t ptls, void *v, int d) // v isa jl_value_t* -{ -#ifdef JL_DEBUG_BUILD - if (v == gc_findval) - jl_raise_debugger(); -#endif - assert(v != NULL); - jl_taggedvalue_t *o = jl_astaggedvalue(v); - verify_val(v); - uintptr_t tag = o->header; - if (!gc_marked(tag)) { - uint8_t bits; - if (__likely(gc_setmark_tag(o, GC_MARKED, tag, &bits))) { - tag = tag & ~(uintptr_t)15; - if (!gc_verifying) - gc_mark_obj(ptls, (jl_value_t*)v, tag, bits); - gc_scan_obj_(ptls, (jl_value_t*)v, d, tag, bits); - } - return !gc_old(bits); - } - return !gc_old(tag); -} // TODO rename this as it is misleading now void jl_gc_setmark(jl_ptls_t ptls, jl_value_t *v) @@ -1350,43 +1304,6 @@ void jl_gc_setmark(jl_ptls_t ptls, jl_value_t *v) } } -NOINLINE static int gc_mark_module(jl_ptls_t ptls, jl_module_t *m, - int d, int8_t bits) -{ - size_t i; - int refyoung = 0; - void **table = m->bindings.table; - for(i=1; i < m->bindings.size; i+=2) { - if (table[i] != HT_NOTFOUND) { - jl_binding_t *b = (jl_binding_t*)table[i]; - gc_setmark_buf_(ptls, b, bits, sizeof(jl_binding_t)); - void *vb = jl_astaggedvalue(b); - verify_parent1("module", m, &vb, "binding_buff"); - (void)vb; - if (b->value != NULL) { - verify_parent2("module", m, &b->value, "binding(%s)", - jl_symbol_name(b->name)); - refyoung |= gc_push_root(ptls, b->value, d); - } - if (b->globalref != NULL) - refyoung |= gc_push_root(ptls, b->globalref, d); - } - } - // this is only necessary because bindings for "using" modules - // are added only when accessed. therefore if a module is replaced - // after "using" it but before accessing it, this array might - // contain the only reference. - for(i=0; i < m->usings.len; i++) { - refyoung |= gc_push_root(ptls, m->usings.items[i], d); - } - - if (m->parent) { - refyoung |= gc_push_root(ptls, m->parent, d); - } - - return refyoung; -} - // Handle the case where the stack is only partially copied. 
STATIC_INLINE uintptr_t gc_get_stack_addr(void *_addr, uintptr_t offset, uintptr_t lb, uintptr_t ub) @@ -1404,113 +1321,17 @@ STATIC_INLINE uintptr_t gc_read_stack(void *_addr, uintptr_t offset, return *(uintptr_t*)real_addr; } -static void gc_mark_stack(jl_ptls_t ptls, jl_gcframe_t *s, uintptr_t offset, - uintptr_t lb, uintptr_t ub, int d) -{ - while (s != NULL) { - jl_value_t ***rts = (jl_value_t***)(((void**)s) + 2); - size_t nroots = gc_read_stack(&s->nroots, offset, lb, ub); - size_t nr = nroots >> 1; - if (nroots & 1) { - for (size_t i = 0; i < nr; i++) { - void **slot = (void**)gc_read_stack(&rts[i], offset, lb, ub); - void *obj = (void*)gc_read_stack(slot, offset, lb, ub); - if (obj != NULL) { - gc_push_root(ptls, obj, d); - } - } - } - else { - for (size_t i=0; i < nr; i++) { - void *obj = (void*)gc_read_stack(&rts[i], offset, lb, ub); - if (obj) { - gc_push_root(ptls, obj, d); - } - } - } - s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub); - } -} - -static void gc_mark_task_stack(jl_ptls_t ptls, jl_task_t *ta, int d, int8_t bits) -{ - gc_scrub_record_task(ta); - int stkbuf = (ta->stkbuf != (void*)(intptr_t)-1 && ta->stkbuf != NULL); - int16_t tid = ta->tid; - jl_ptls_t ptls2 = jl_all_tls_states[tid]; - if (stkbuf) { -#ifdef COPY_STACKS - gc_setmark_buf_(ptls, ta->stkbuf, bits, ta->bufsz); -#else - // stkbuf isn't owned by julia for the root task - if (ta != ptls2->root_task) { - gc_setmark_buf_(ptls, ta->stkbuf, bits, ta->ssize); - } -#endif - } - if (ta == ptls2->current_task) { - gc_mark_stack(ptls, ptls2->pgcstack, 0, 0, (uintptr_t)-1, d); - } - else if (stkbuf) { - uintptr_t offset = 0; - uintptr_t lb = 0; - uintptr_t ub = (uintptr_t)-1; -#ifdef COPY_STACKS - ub = (uintptr_t)ptls2->stackbase; - lb = ub - ta->ssize; - offset = (uintptr_t)ta->stkbuf - lb; -#endif - gc_mark_stack(ptls, ta->gcstack, offset, lb, ub, d); - } -} - -NOINLINE static void gc_mark_task(jl_ptls_t ptls, jl_task_t *ta, - int d, int8_t bits) -{ - if (ta->parent) gc_push_root(ptls, ta->parent, d); - gc_push_root(ptls, ta->tls, d); - gc_push_root(ptls, ta->consumers, d); - gc_push_root(ptls, ta->donenotify, d); - gc_push_root(ptls, ta->exception, d); - if (ta->backtrace) gc_push_root(ptls, ta->backtrace, d); - if (ta->start) gc_push_root(ptls, ta->start, d); - if (ta->result) gc_push_root(ptls, ta->result, d); - gc_mark_task_stack(ptls, ta, d, bits); -} - -void gc_mark_object_list(jl_ptls_t ptls, arraylist_t *list, size_t start) -{ - void **items = list->items; - size_t len = list->len; - for (size_t i = start;i < len;i++) { - void *v = items[i]; - if (__unlikely(!v)) - continue; - if (gc_ptr_tag(v, 1)) { - v = gc_ptr_clear_tag(v, 1); - i++; - assert(i < len); - } - gc_push_root(ptls, v, 0); - } -} - -JL_NORETURN NOINLINE void gc_assert_datatype_fail(jl_datatype_t *vt) +JL_NORETURN NOINLINE void gc_assert_datatype_fail(jl_ptls_t ptls, jl_datatype_t *vt, + gc_mark_sp_t sp) { jl_printf(JL_STDOUT, "GC error (probable corruption) :\n"); gc_debug_print_status(); jl_(vt); gc_debug_critical_error(); + gc_mark_loop_unwind(ptls, sp, 0); abort(); } -STATIC_INLINE void gc_assert_datatype(jl_datatype_t *vt) -{ - if (__likely(jl_is_datatype(vt))) - return; - gc_assert_datatype_fail(vt); -} - // This stores the label address in the mark loop function. // We can't directly store that to a global array so we need some hack to get that. // See the call to `gc_mark_loop` in init with a `NULL` `ptls`. 
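/* For reference, the "hack" mentioned above is the bootstrap call added to jl_gc_init
 * earlier in this series: calling gc_mark_loop with a NULL ptls makes it return its
 * internal label table instead of marking anything:
 *
 *     gc_mark_sp_t sp = {NULL, NULL, NULL, NULL};
 *     void *const *label_addrs = gc_mark_loop(NULL, sp);
 *     for (int i = 0; i < gc_mark_nlabels; i++)
 *         gc_mark_label_addrs[i] = label_addrs[i];
 */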
@@ -2295,7 +2116,8 @@ mark: { goto pop; } else { - gc_assert_datatype(vt); + if (__unlikely(!jl_is_datatype(vt))) + gc_assert_datatype_fail(ptls, vt, sp); if (update_meta) gc_setmark(ptls, o, bits, jl_datatype_size(vt)); if (vt == jl_weakref_type) @@ -2350,236 +2172,50 @@ mark: { } } -#define MAX_MARK_DEPTH 400 -// Scan a marked object `v` and recursively mark its children. -// The object will be queued on the mark stack when recursion depth -// becomes too high. -// It does so assuming that the tag of the (marked) object is `tag`. -// If `v` is `GC_OLD_MARKED` and some of its children are `GC_MARKED` (young), -// `v` is added to the remset -static void gc_scan_obj_(jl_ptls_t ptls, jl_value_t *v, int d, - uintptr_t tag, uint8_t bits) -{ - assert(v != NULL); - assert(gc_marked(bits)); - jl_datatype_t *vt = (jl_datatype_t*)tag; -#ifdef JL_DEBUG_BUILD - gc_assert_datatype(vt); // should have checked in `gc_mark_obj` -#endif - int refyoung = 0, nptr = 0; - - assert(vt != jl_symbol_type); - // weakref should not be marked - if (vt == jl_weakref_type) - return; - // fast path - uint32_t npointers = vt->layout->npointers; - if (npointers == 0) - return; - d++; - if (d >= MAX_MARK_DEPTH) - goto queue_the_root; - - // some values have special representations - if (vt == jl_simplevector_type) { - size_t l = jl_svec_len(v); - jl_value_t **data = jl_svec_data(v); - nptr += l; - for(size_t i=0; i < l; i++) { - jl_value_t *elt = data[i]; - if (elt != NULL) { - verify_parent2("svec", v, &data[i], "elem(%d)", (int)i); - refyoung |= gc_push_root(ptls, elt, d); - } - } - } - else if (vt->name == jl_array_typename) { - jl_array_t *a = (jl_array_t*)v; - jl_array_flags_t flags = a->flags; - if (flags.how == 3) { - jl_value_t *owner = jl_array_data_owner(a); - refyoung |= gc_push_root(ptls, owner, d); - goto ret; - } - else if (flags.how == 1) { - void *val_buf = jl_astaggedvalue((char*)a->data - - a->offset * a->elsize); - verify_parent1("array", v, &val_buf, - "buffer ('loc' addr is meaningless)"); - (void)val_buf; - gc_setmark_buf_(ptls, (char*)a->data - a->offset * a->elsize, - bits, array_nbytes(a)); - } - if (flags.ptrarray && a->data != NULL) { - size_t l = jl_array_len(a); - if (l > 100000 && d > MAX_MARK_DEPTH-10) { - // don't mark long arrays at high depth, to try to avoid - // copying the whole array into the mark queue - goto queue_the_root; - } - else { - nptr += l; - void *data = a->data; - for (size_t i=0; i < l; i++) { - jl_value_t *elt = ((jl_value_t**)data)[i]; - if (elt != NULL) { - verify_parent2("array", v, &((jl_value_t**)data)[i], "elem(%d)", (int)i); - refyoung |= gc_push_root(ptls, elt, d); - } - } - } - } - } - else if (vt == jl_module_type) { - // should increase nptr here - refyoung |= gc_mark_module(ptls, (jl_module_t*)v, d, bits); - } - else if (vt == jl_task_type) { - // ditto nptr - gc_mark_task(ptls, (jl_task_t*)v, d, bits); - // tasks should always be remarked since we do not trigger the write barrier - // for stores to stack slots - refyoung = 1; - } - else { - int nf = (int)jl_datatype_nfields(vt); - nptr += (npointers & 0xff) << (npointers & 0x300); - uint32_t offsets = jl_datatype_layout_n_nonptr(vt->layout); - nf -= offsets & 0xffff; - for (int i = (offsets >> 16); i < nf; i++) { - if (jl_field_isptr(vt, i)) { - jl_value_t **slot = (jl_value_t**)((char*)v + jl_field_offset(vt, i)); - jl_value_t *fld = *slot; - if (fld) { - verify_parent2("object", v, slot, "field(%d)", i); - refyoung |= gc_push_root(ptls, fld, d); - } - } - } - } - -ret: - if ((bits == GC_OLD_MARKED) && 
refyoung && !gc_verifying) { - ptls->heap.remset_nptr += nptr; - // v is an old object referencing young objects - arraylist_push(ptls->heap.remset, v); - } - return; - -queue_the_root: - if (mark_sp >= mark_stack_size) - grow_mark_stack(); - mark_stack[mark_sp++] = (jl_value_t*)v; -} - -STATIC_INLINE void gc_scan_obj(jl_ptls_t ptls, jl_value_t *v, int d, - uintptr_t tag) -{ - gc_scan_obj_(ptls, v, d, tag & ~(uintptr_t)15, tag & 0xf); -} - -// Update the metadata of a marked object (without scanning it). -static void gc_mark_obj(jl_ptls_t ptls, jl_value_t *v, - uintptr_t tag, uint8_t bits) -{ - assert(v != NULL); - assert(gc_marked(bits)); - jl_taggedvalue_t *o = jl_astaggedvalue(v); - jl_datatype_t *vt = (jl_datatype_t*)(tag & ~(uintptr_t)15); - gc_assert_datatype(vt); - // Symbols are always marked - assert(vt != jl_symbol_type); - - // some values have special representations - if (vt == jl_simplevector_type) { - size_t l = jl_svec_len(v); - gc_setmark(ptls, o, bits, l * sizeof(void*) + sizeof(jl_svec_t)); - } - else if (vt->name == jl_array_typename) { - jl_array_t *a = (jl_array_t*)v; - jl_array_flags_t flags = a->flags; - if (flags.pooled) - gc_setmark_pool(ptls, o, bits); - else - gc_setmark_big(ptls, o, bits); - if (flags.how == 2) { - objprofile_count(jl_malloc_tag, bits == GC_OLD_MARKED, - array_nbytes(a)); - if (bits == GC_OLD_MARKED) { - ptls->gc_cache.perm_scanned_bytes += array_nbytes(a); - } - else { - ptls->gc_cache.scanned_bytes += array_nbytes(a); - } - } - } - else if (vt == jl_module_type) { - gc_setmark(ptls, o, bits, sizeof(jl_module_t)); - } - else if (vt == jl_task_type) { - gc_setmark(ptls, o, bits, sizeof(jl_task_t)); - } - else if (vt == jl_string_type) { - gc_setmark(ptls, o, bits, jl_string_len(v) + sizeof(size_t) + 1); - } - else { - gc_setmark(ptls, o, bits, jl_datatype_size(vt)); - } -} - -void visit_mark_stack(jl_ptls_t ptls) -{ - while (mark_sp > 0 && !should_timeout()) { - jl_value_t *v = mark_stack[--mark_sp]; - assert(jl_astaggedvalue(v)->bits.gc); - gc_scan_obj(ptls, v, 0, jl_astaggedvalue(v)->header); - } - assert(!mark_sp); -} - extern jl_array_t *jl_module_init_order; extern jl_typemap_entry_t *call_cache[N_CALL_CACHE]; extern jl_array_t *jl_all_methods; -static void jl_gc_mark_thread_local(jl_ptls_t ptls, jl_ptls_t ptls2) +static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp, + jl_ptls_t ptls2) { // `current_module` might not have a value when the thread is not // running. 
if (ptls2->current_module) - gc_push_root(ptls, ptls2->current_module, 0); - gc_push_root(ptls, ptls2->current_task, 0); - gc_push_root(ptls, ptls2->root_task, 0); - gc_push_root(ptls, ptls2->exception_in_transit, 0); + gc_mark_queue_obj(gc_cache, sp, ptls2->current_module); + gc_mark_queue_obj(gc_cache, sp, ptls2->current_task); + gc_mark_queue_obj(gc_cache, sp, ptls2->root_task); + gc_mark_queue_obj(gc_cache, sp, ptls2->exception_in_transit); } // mark the initial root set -static void mark_roots(jl_ptls_t ptls) +static void mark_roots(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp) { // modules - gc_push_root(ptls, jl_main_module, 0); - gc_push_root(ptls, jl_internal_main_module, 0); + gc_mark_queue_obj(gc_cache, sp, jl_main_module); + gc_mark_queue_obj(gc_cache, sp, jl_internal_main_module); // invisible builtin values if (jl_an_empty_vec_any != NULL) - gc_push_root(ptls, jl_an_empty_vec_any, 0); + gc_mark_queue_obj(gc_cache, sp, jl_an_empty_vec_any); if (jl_module_init_order != NULL) - gc_push_root(ptls, jl_module_init_order, 0); - gc_push_root(ptls, jl_cfunction_list.unknown, 0); - gc_push_root(ptls, jl_anytuple_type_type, 0); - gc_push_root(ptls, jl_ANY_flag, 0); + gc_mark_queue_obj(gc_cache, sp, jl_module_init_order); + gc_mark_queue_obj(gc_cache, sp, jl_cfunction_list.unknown); + gc_mark_queue_obj(gc_cache, sp, jl_anytuple_type_type); + gc_mark_queue_obj(gc_cache, sp, jl_ANY_flag); for (size_t i = 0; i < N_CALL_CACHE; i++) if (call_cache[i]) - gc_push_root(ptls, call_cache[i], 0); + gc_mark_queue_obj(gc_cache, sp, call_cache[i]); if (jl_all_methods != NULL) - gc_push_root(ptls, jl_all_methods, 0); + gc_mark_queue_obj(gc_cache, sp, jl_all_methods); #ifndef COPY_STACKS - gc_push_root(ptls, jl_unprotect_stack_func, 0); + gc_mark_queue_obj(gc_cache, sp, jl_unprotect_stack_func); #endif // constants - gc_push_root(ptls, jl_typetype_type, 0); - gc_push_root(ptls, jl_emptytuple_type, 0); + gc_mark_queue_obj(gc_cache, sp, jl_typetype_type); + gc_mark_queue_obj(gc_cache, sp, jl_emptytuple_type); } // find unmarked objects that need to be finalized from the finalizer list "list". 
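In mark_roots and jl_gc_queue_thread_local above, roots are only pushed; nothing is scanned until gc_mark_loop later drains the stack. A minimal sketch of that push step onto an explicitly managed, growable stack follows, in the spirit of the grow_mark_stack code removed earlier; the types and the doubling policy here are hypothetical and do not mirror gc_mark_sp_t.

    /* Sketch: queue a root on a heap-allocated stack that grows on demand,
     * instead of recursing on the native C stack. */
    #include <stdlib.h>

    typedef struct { void **items; size_t len, cap; } mark_stack_sketch_t;

    static void queue_root_sketch(mark_stack_sketch_t *sp, void *obj)
    {
        if (obj == NULL)
            return;               /* nothing to queue */
        if (sp->len == sp->cap) { /* grow before pushing */
            size_t newcap = sp->cap ? sp->cap * 2 : 1024;
            void **p = realloc(sp->items, newcap * sizeof(*p));
            if (p == NULL)
                abort();          /* cannot grow the mark stack */
            sp->items = p;
            sp->cap = newcap;
        }
        sp->items[sp->len++] = obj;
    }

    int main(void)
    {
        mark_stack_sketch_t sp = {0};
        int a, b;
        queue_root_sketch(&sp, &a);  /* e.g. a thread-local root */
        queue_root_sketch(&sp, &b);  /* e.g. a global root */
        free(sp.items);
        return 0;
    }
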
@@ -2713,14 +2349,12 @@ static void jl_gc_premark(jl_ptls_t ptls2) } } -static void jl_gc_mark_remset(jl_ptls_t ptls, jl_ptls_t ptls2) +static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp, jl_ptls_t ptls2) { size_t len = ptls2->heap.last_remset->len; void **items = ptls2->heap.last_remset->items; - for (size_t i = 0; i < len; i++) { - jl_value_t *item = (jl_value_t*)items[i]; - gc_scan_obj(ptls, item, 0, jl_astaggedvalue(item)->header); - } + for (size_t i = 0; i < len; i++) + gc_mark_queue_scan_obj(gc_cache, sp, (jl_value_t*)items[i]); int n_bnd_refyoung = 0; len = ptls2->heap.rem_bindings.len; items = ptls2->heap.rem_bindings.items; @@ -2729,7 +2363,7 @@ static void jl_gc_mark_remset(jl_ptls_t ptls, jl_ptls_t ptls2) // A null pointer can happen here when the binding is cleaned up // as an exception is thrown after it was already queued (#10221) if (!ptr->value) continue; - if (gc_push_root(ptls, ptr->value, 0)) { + if (gc_mark_queue_obj(gc_cache, sp, ptr->value)) { items[n_bnd_refyoung] = ptr; n_bnd_refyoung++; } @@ -2750,9 +2384,12 @@ static void jl_gc_mark_ptrfree(jl_ptls_t ptls) // Only one thread should be running in this function static int _jl_gc_collect(jl_ptls_t ptls, int full) { + jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; + gc_mark_sp_t sp; + gc_mark_sp_init(gc_cache, &sp); + uint64_t t0 = jl_hrtime(); int64_t last_perm_scanned_bytes = perm_scanned_bytes; - assert(mark_sp == 0); // 1. fix GC bits of objects in the remset. for (int t_i = 0; t_i < jl_n_threads; t_i++) @@ -2761,14 +2398,15 @@ static int _jl_gc_collect(jl_ptls_t ptls, int full) for (int t_i = 0; t_i < jl_n_threads; t_i++) { jl_ptls_t ptls2 = jl_all_tls_states[t_i]; // 2.1. mark every object in the `last_remsets` and `rem_binding` - jl_gc_mark_remset(ptls, ptls2); + jl_gc_queue_remset(gc_cache, &sp, ptls2); // 2.2. mark every thread local root - jl_gc_mark_thread_local(ptls, ptls2); + jl_gc_queue_thread_local(gc_cache, &sp, ptls2); } // 3. walk roots - mark_roots(ptls); - visit_mark_stack(ptls); + mark_roots(gc_cache, &sp); + gc_mark_loop(ptls, sp); + gc_mark_sp_init(gc_cache, &sp); gc_num.since_sweep += gc_num.allocd + (int64_t)gc_num.interval; gc_settime_premark_end(); gc_time_mark_pause(t0, scanned_bytes, perm_scanned_bytes); @@ -2790,19 +2428,20 @@ static int _jl_gc_collect(jl_ptls_t ptls, int full) } for (int i = 0;i < jl_n_threads;i++) { jl_ptls_t ptls2 = jl_all_tls_states[i]; - gc_mark_object_list(ptls, &ptls2->finalizers, 0); + gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } - gc_mark_object_list(ptls, &finalizer_list_marked, orig_marked_len); + gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, orig_marked_len); // "Flush" the mark stack before flipping the reset_age bit // so that the objects are not incorrectly resetted. - visit_mark_stack(ptls); + gc_mark_loop(ptls, sp); + gc_mark_sp_init(gc_cache, &sp); mark_reset_age = 1; // Reset the age and old bit for any unmarked objects referenced by the // `to_finalize` list. These objects are only reachable from this list // and should not be referenced by any old objects so this won't break // the GC invariant. 
- gc_mark_object_list(ptls, &to_finalize, 0); - visit_mark_stack(ptls); + gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); + gc_mark_loop(ptls, sp); mark_reset_age = 0; gc_settime_postmark_end(); @@ -2953,11 +2592,12 @@ JL_DLLEXPORT void jl_gc_collect(int full) } } -void mark_all_roots(jl_ptls_t ptls) +void gc_mark_queue_all_roots(jl_ptls_t ptls, gc_mark_sp_t *sp) { + jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; for (size_t i = 0; i < jl_n_threads; i++) - jl_gc_mark_thread_local(ptls, jl_all_tls_states[i]); - mark_roots(ptls); + jl_gc_queue_thread_local(gc_cache, sp, jl_all_tls_states[i]); + mark_roots(gc_cache, sp); jl_gc_mark_ptrfree(ptls); } diff --git a/src/gc.h b/src/gc.h index c512bb97203f0b..8615b281730966 100644 --- a/src/gc.h +++ b/src/gc.h @@ -473,9 +473,7 @@ STATIC_INLINE void gc_mark_sp_init(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *s sp->pc_end = gc_cache->pc_stack_end; } -void mark_all_roots(jl_ptls_t ptls); -void gc_mark_object_list(jl_ptls_t ptls, arraylist_t *list, size_t start); -void visit_mark_stack(jl_ptls_t ptls); +void gc_mark_queue_all_roots(jl_ptls_t ptls, gc_mark_sp_t *sp); void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp, arraylist_t *list, size_t start); void *const *gc_mark_loop(jl_ptls_t ptls, gc_mark_sp_t sp);
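Taken together, the _jl_gc_collect changes and the new gc.h declarations follow a queue-then-drain shape: the gc_mark_queue_* helpers push roots, gc_mark_loop drains the stack, and gc_mark_sp_init resets the stack pointer before the next batch (the finalizer lists) is queued. A minimal standalone sketch of that control flow, with hypothetical stand-ins rather than the real gc_cache/sp machinery:

    /* Sketch: two queue/drain rounds over a fixed-size stack; the real code
     * uses growable stacks and marks objects instead of printing them. */
    #include <stdio.h>

    #define CAP 64
    typedef struct { int items[CAP]; int top; } sp_sketch_t;

    static void sp_init(sp_sketch_t *sp) { sp->top = 0; }

    static void queue(sp_sketch_t *sp, int v)
    {
        if (sp->top < CAP)
            sp->items[sp->top++] = v;
    }

    static void drain(sp_sketch_t *sp)      /* plays the role of gc_mark_loop */
    {
        while (sp->top > 0)
            printf("scan %d\n", sp->items[--sp->top]);
    }

    int main(void)
    {
        sp_sketch_t sp;
        sp_init(&sp);
        queue(&sp, 1);   /* remsets, thread-local roots, global roots */
        queue(&sp, 2);
        drain(&sp);      /* first marking pass */
        sp_init(&sp);    /* reset before queueing the next batch */
        queue(&sp, 3);   /* finalizer lists */
        drain(&sp);      /* second pass */
        return 0;
    }
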