From dfab7be1f8bf8fd4400e7f61954b9596eacf4de0 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Tue, 24 Jan 2023 14:07:52 -0500 Subject: [PATCH] GC mark-loop rewrite (#47292) ## Previous work Since https://github.com/JuliaLang/julia/pull/21590, the GC mark-loop was implemented by keeping two manually managed stacks: one of which contained iterator states used to keep track of the object currently being marked. As an example, to mark arrays, we would pop the corresponding iterator state from the stack, iterate over the array until we found an unmarked reference, and if so, we would update the iterator state (to reflect the index we left off), "repush" it into the stack and proceed with marking the reference we just found. ## This PR This PR eliminates the need of keeping the iterator states by modifying the object graph traversal code. We keep a single stack of `jl_value_t *` currently being processed. To mark an object, we first pop it from the stack, push all unmarked references into the stack and proceed with marking. I believe this doesn't break any invariant from the generational GC. Indeed, the age bits are set after marking (if the object survived one GC cycle it's moved to the old generation), so this new traversal scheme wouldn't change the fact of whether an object had references to old objects or not. Furthermore, we must not update GC metadata for objects in the `remset`, and we ensure this by calling `gc_mark_outrefs` in `gc_queue_remset` with `meta_updated` set to 1. ## Additional advantages 1. There are no recursive function calls in the GC mark-loop code (one of the reasons why https://github.com/JuliaLang/julia/pull/21590 was implemented). 2. Keeping a single GC queue will **greatly** simplify work-stealing in the multi-threaded GC we are working on (c.f. https://github.com/JuliaLang/julia/pull/45639). 3. Arrays of references, for example, are now marked on a regular stride fashion, which could help with hardware prefetching. 4. We can easily modify the traversal mode (to breadth first, for example) by only changing the `jl_gc_markqueue_t`(from LIFO to FIFO, for example) methods without touching the mark-loop itself, which could enable further exploration on the GC in the future. Since this PR changes the mark-loop graph traversal code, there are some changes in the heap-snapshot, though I'm not familiar with that PR. Some benchmark results are here: https://hackmd.io/@Idnmfpb3SxK98-OsBtRD5A/H1V6QSzvs. --- src/gc-debug.c | 175 +--- src/gc.c | 1953 +++++++++++++++++------------------------- src/gc.h | 202 +---- src/julia_internal.h | 6 +- src/julia_threads.h | 21 +- src/partr.c | 2 +- src/support/dtypes.h | 1 + 7 files changed, 865 insertions(+), 1495 deletions(-) diff --git a/src/gc-debug.c b/src/gc-debug.c index 011788fbc7b2e..a233b18d7dcfc 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -198,21 +198,23 @@ static void restore(void) static void gc_verify_track(jl_ptls_t ptls) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; do { - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + jl_gc_markqueue_t mq; + mq.current = mq.start = ptls->mark_queue.start; + mq.end = ptls->mark_queue.end; + mq.current_chunk = mq.chunk_start = ptls->mark_queue.chunk_start; + mq.chunk_end = ptls->mark_queue.chunk_end; arraylist_push(&lostval_parents_done, lostval); jl_safe_printf("Now looking for %p =======\n", lostval); clear_mark(GC_CLEAN); - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0; i < gc_n_threads; i++) { + gc_mark_queue_all_roots(ptls, &mq); + gc_mark_finlist(&mq, &to_finalize, 0); + for (int i = 0; i < gc_n_threads;i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_finlist(&mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(&mq, &finalizer_list_marked, 0); + gc_mark_loop_(ptls, &mq); if (lostval_parents.len == 0) { jl_safe_printf("Could not find the missing link. We missed a toplevel root. This is odd.\n"); break; @@ -246,22 +248,24 @@ static void gc_verify_track(jl_ptls_t ptls) void gc_verify(jl_ptls_t ptls) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + jl_gc_markqueue_t mq; + mq.current = mq.start = ptls->mark_queue.start; + mq.end = ptls->mark_queue.end; + mq.current_chunk = mq.chunk_start = ptls->mark_queue.chunk_start; + mq.chunk_end = ptls->mark_queue.chunk_end; lostval = NULL; lostval_parents.len = 0; lostval_parents_done.len = 0; clear_mark(GC_CLEAN); gc_verifying = 1; - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0; i < gc_n_threads; i++) { + gc_mark_queue_all_roots(ptls, &mq); + gc_mark_finlist(&mq, &to_finalize, 0); + for (int i = 0; i < gc_n_threads;i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_finlist(&mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(&mq, &finalizer_list_marked, 0); + gc_mark_loop_(ptls, &mq); int clean_len = bits_save[GC_CLEAN].len; for(int i = 0; i < clean_len + bits_save[GC_OLD].len; i++) { jl_taggedvalue_t *v = (jl_taggedvalue_t*)bits_save[i >= clean_len ? GC_OLD : GC_CLEAN].items[i >= clean_len ? i - clean_len : i]; @@ -500,7 +504,7 @@ int jl_gc_debug_check_other(void) return gc_debug_alloc_check(&jl_gc_debug_env.other); } -void jl_gc_debug_print_status(void) +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT { uint64_t pool_count = jl_gc_debug_env.pool.num; uint64_t other_count = jl_gc_debug_env.other.num; @@ -509,7 +513,7 @@ void jl_gc_debug_print_status(void) pool_count + other_count, pool_count, other_count, gc_num.pause); } -void jl_gc_debug_critical_error(void) +void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT { jl_gc_debug_print_status(); if (!jl_gc_debug_env.wait_for_debugger) @@ -1264,9 +1268,9 @@ int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT return (slot - start) / elsize; } -// Print a backtrace from the bottom (start) of the mark stack up to `sp` -// `pc_offset` will be added to `sp` for convenience in the debugger. -NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset) +// Print a backtrace from the `mq->start` of the mark queue up to `mq->current` +// `offset` will be added to `mq->current` for convenience in the debugger. +NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_markqueue_t *mq, int offset) { jl_jmp_buf *old_buf = jl_get_safe_restore(); jl_jmp_buf buf; @@ -1276,123 +1280,14 @@ NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_off jl_set_safe_restore(old_buf); return; } - void **top = sp.pc + pc_offset; - jl_gc_mark_data_t *data_top = sp.data; - sp.data = ptls->gc_cache.data_stack; - sp.pc = ptls->gc_cache.pc_stack; - int isroot = 1; - while (sp.pc < top) { - void *pc = *sp.pc; - const char *prefix = isroot ? "r--" : " `-"; - isroot = 0; - if (pc == gc_mark_label_addrs[GC_MARK_L_marked_obj]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Root object: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_scan_only]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Queued root: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_finlist]) { - gc_mark_finlist_t *data = gc_repush_markdata(&sp, gc_mark_finlist_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Finalizer list from %p to %p\n", - (void*)data, (void*)data->begin, (void*)data->end); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_objarray]) { - gc_mark_objarray_t *data = gc_repush_markdata(&sp, gc_mark_objarray_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Array in object %p :: %p -- [%p, %p)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (void*)data->begin, (void*)data->end); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj8]) { - gc_mark_obj8_t *data = gc_repush_markdata(&sp, gc_mark_obj8_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint8_t *desc = (uint8_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (8bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj16]) { - gc_mark_obj16_t *data = gc_repush_markdata(&sp, gc_mark_obj16_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint16_t *desc = (uint16_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (16bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj32]) { - gc_mark_obj32_t *data = gc_repush_markdata(&sp, gc_mark_obj32_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint32_t *desc = (uint32_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (32bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_stack]) { - gc_mark_stackframe_t *data = gc_repush_markdata(&sp, gc_mark_stackframe_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Stack frame %p -- %d of %d (%s)\n", - (void*)data, prefix, (void*)data->s, (int)data->i, - (int)data->nroots >> 1, - (data->nroots & 1) ? "indirect" : "direct"); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_module_binding]) { - // module_binding - gc_mark_binding_t *data = gc_repush_markdata(&sp, gc_mark_binding_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Module (bindings) %p -- [%p, %p)\n", - (void*)data, prefix, (void*)data->parent, - (void*)data->begin, (void*)data->end); - } - else { - jl_safe_printf("Unknown pc %p --- ABORTING !!!\n", pc); - break; - } + jl_value_t **start = mq->start; + jl_value_t **end = mq->current + offset; + for (; start < end; start++) { + jl_value_t *obj = *start; + jl_taggedvalue_t *o = jl_astaggedvalue(obj); + jl_safe_printf("Queued object: %p :: (tag: %zu) (bits: %zu)\n", obj, + (uintptr_t)o->header, ((uintptr_t)o->header & 3)); + jl_((void*)(jl_datatype_t *)(o->header & ~(uintptr_t)0xf)); } jl_set_safe_restore(old_buf); } diff --git a/src/gc.c b/src/gc.c index fc2a4041910f5..d1cc57cf787de 100644 --- a/src/gc.c +++ b/src/gc.c @@ -112,17 +112,6 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre jl_gc_deregister_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); } -// Save/restore local mark stack to/from thread-local storage. - -STATIC_INLINE void export_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - ptls->gc_mark_sp = *sp; -} - -STATIC_INLINE void import_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - // Has the stack been reallocated in the meantime? - *sp = ptls->gc_mark_sp; -} - // Protect all access to `finalizer_list_marked` and `to_finalize`. // For accessing `ptls->finalizers`, the lock is needed if a thread // is going to realloc the buffer (of its own list) or accessing the @@ -208,16 +197,16 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) jl_wake_libuv(); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - // This acquire load pairs with the release stores - // in the signal handler of safepoint so we are sure that - // all the stores on those threads are visible. - // We're currently also using atomic store release in mutator threads - // (in jl_gc_state_set), but we may want to use signals to flush the - // memory operations on those threads lazily instead. - while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) - jl_cpu_pause(); // yield? + if (ptls2 != NULL) { + // This acquire load pairs with the release stores + // in the signal handler of safepoint so we are sure that + // all the stores on those threads are visible. + // We're currently also using atomic store release in mutator threads + // (in jl_gc_state_set), but we may want to use signals to flush the + // memory operations on those threads lazily instead. + while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) + jl_cpu_pause(); // yield? + } } } @@ -522,7 +511,7 @@ void jl_gc_run_all_finalizers(jl_task_t *ct) schedule_all_finalizers(&finalizer_list_marked); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) schedule_all_finalizers(&ptls2->finalizers); } gc_n_threads = 0; @@ -596,7 +585,7 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); } finalize_object(&finalizer_list_marked, o, &copied_list, 0); @@ -636,7 +625,7 @@ static void gc_sweep_foreign_objs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); } } @@ -769,7 +758,7 @@ static void gc_sync_all_caches_nolock(jl_ptls_t ptls) assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2) + if (ptls2 != NULL) gc_sync_cache_nolock(ptls, &ptls2->gc_cache); } } @@ -788,21 +777,13 @@ STATIC_INLINE void gc_queue_big_marked(jl_ptls_t ptls, bigval_t *hdr, ptls->gc_cache.nbig_obj = nobj + 1; } -// `gc_setmark_tag` can be called concurrently on multiple threads. -// In all cases, the function atomically sets the mark bits and returns -// the GC bits set as well as if the tag was unchanged by this thread. -// All concurrent calls on the same object are guaranteed to be setting the -// bits to the same value. -// For normal objects, this is the bits with only `GC_MARKED` changed to `1` -// For buffers, this is the bits of the owner object. -// For `mark_reset_age`, this is `GC_MARKED` with `GC_OLD` cleared. -// The return value is `1` if the object was not marked before. -// Returning `0` can happen if another thread marked it in parallel. -STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, - uintptr_t tag, uint8_t *bits) JL_NOTSAFEPOINT -{ - assert(!gc_marked(tag)); +// Atomically set the mark bit for object and return whether it was previously unmarked +FORCE_INLINE int gc_try_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode) JL_NOTSAFEPOINT +{ assert(gc_marked(mark_mode)); + uintptr_t tag = o->header; + if (gc_marked(tag)) + return 0; if (mark_reset_age) { // Reset the object as if it was just allocated mark_mode = GC_MARKED; @@ -814,7 +795,6 @@ STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, tag = tag | mark_mode; assert((tag & 0x3) == mark_mode); } - *bits = mark_mode; tag = jl_atomic_exchange_relaxed((_Atomic(uintptr_t)*)&o->header, tag); verify_val(jl_valueof(o)); return !gc_marked(tag); @@ -897,15 +877,12 @@ STATIC_INLINE void gc_setmark(jl_ptls_t ptls, jl_taggedvalue_t *o, STATIC_INLINE void gc_setmark_buf_(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) JL_NOTSAFEPOINT { jl_taggedvalue_t *buf = jl_astaggedvalue(o); - uintptr_t tag = buf->header; - if (gc_marked(tag)) - return; - uint8_t bits; + uint8_t bits = (gc_old(buf->header) && !mark_reset_age) ? GC_OLD_MARKED : GC_MARKED;; // If the object is larger than the max pool size it can't be a pool object. // This should be accurate most of the time but there might be corner cases // where the size estimate is a little off so we do a pool lookup to make // sure. - if (__likely(gc_setmark_tag(buf, mark_mode, tag, &bits)) && !gc_verifying) { + if (__likely(gc_try_setmark_tag(buf, mark_mode)) && !gc_verifying) { if (minsz <= GC_MAX_SZCLASS) { jl_gc_pagemeta_t *page = page_metadata(buf); if (page) { @@ -953,7 +930,7 @@ void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v) JL_NOTSAFEPOINT jl_gc_queue_root(v); } -static inline void maybe_collect(jl_ptls_t ptls) +STATIC_INLINE void maybe_collect(jl_ptls_t ptls) { if (jl_atomic_load_relaxed(&ptls->gc_num.allocd) >= 0 || jl_gc_debug_check_other()) { jl_gc_collect(JL_GC_AUTO); @@ -980,14 +957,14 @@ static void clear_weak_refs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - size_t n, l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; - for (n = 0; n < l; n++) { - jl_weakref_t *wr = (jl_weakref_t*)lst[n]; - if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) - wr->value = (jl_value_t*)jl_nothing; + if (ptls2 != NULL) { + size_t n, l = ptls2->heap.weak_refs.len; + void **lst = ptls2->heap.weak_refs.items; + for (n = 0; n < l; n++) { + jl_weakref_t *wr = (jl_weakref_t*)lst[n]; + if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) + wr->value = (jl_value_t*)jl_nothing; + } } } } @@ -997,27 +974,27 @@ static void sweep_weak_refs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - size_t n = 0; - size_t ndel = 0; - size_t l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; - if (l == 0) - continue; - while (1) { - jl_weakref_t *wr = (jl_weakref_t*)lst[n]; - if (gc_marked(jl_astaggedvalue(wr)->bits.gc)) - n++; - else - ndel++; - if (n >= l - ndel) - break; - void *tmp = lst[n]; - lst[n] = lst[n + ndel]; - lst[n + ndel] = tmp; + if (ptls2 != NULL) { + size_t n = 0; + size_t ndel = 0; + size_t l = ptls2->heap.weak_refs.len; + void **lst = ptls2->heap.weak_refs.items; + if (l == 0) + continue; + while (1) { + jl_weakref_t *wr = (jl_weakref_t*)lst[n]; + if (gc_marked(jl_astaggedvalue(wr)->bits.gc)) + n++; + else + ndel++; + if (n >= l - ndel) + break; + void *tmp = lst[n]; + lst[n] = lst[n + ndel]; + lst[n + ndel] = tmp; + } + ptls2->heap.weak_refs.len -= ndel; } - ptls2->heap.weak_refs.len -= ndel; } } @@ -1025,7 +1002,7 @@ static void sweep_weak_refs(void) // big value list // Size includes the tag and the tag is not cleared!! -static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) +STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) { maybe_collect(ptls); size_t offs = offsetof(bigval_t, header); @@ -1057,7 +1034,6 @@ static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) { jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); - maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag); return val; } @@ -1118,9 +1094,8 @@ static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - sweep_big_list(sweep_full, &ptls2->heap.big_objects); + if (ptls2 != NULL) + sweep_big_list(sweep_full, &ptls2->heap.big_objects); } if (sweep_full) { bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked); @@ -1189,7 +1164,7 @@ static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; - if (ptls) { + if (ptls != NULL) { memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); } @@ -1238,32 +1213,32 @@ static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) - continue; - mallocarray_t *ma = ptls2->heap.mallocarrays; - mallocarray_t **pma = &ptls2->heap.mallocarrays; - while (ma != NULL) { - mallocarray_t *nxt = ma->next; - int bits = jl_astaggedvalue(ma->a)->bits.gc; - if (gc_marked(bits)) { - pma = &ma->next; - } - else { - *pma = nxt; - assert(ma->a->flags.how == 2); - jl_gc_free_array(ma->a); - ma->next = ptls2->heap.mafreelist; - ptls2->heap.mafreelist = ma; + if (ptls2 != NULL) { + mallocarray_t *ma = ptls2->heap.mallocarrays; + mallocarray_t **pma = &ptls2->heap.mallocarrays; + while (ma != NULL) { + mallocarray_t *nxt = ma->next; + int bits = jl_astaggedvalue(ma->a)->bits.gc; + if (gc_marked(bits)) { + pma = &ma->next; + } + else { + *pma = nxt; + assert(ma->a->flags.how == 2); + jl_gc_free_array(ma->a); + ma->next = ptls2->heap.mafreelist; + ptls2->heap.mafreelist = ma; + } + gc_time_count_mallocd_array(bits); + ma = nxt; } - gc_time_count_mallocd_array(bits); - ma = nxt; } } gc_time_mallocd_array_end(); } // pool allocation -static inline jl_taggedvalue_t *reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT +STATIC_INLINE jl_taggedvalue_t *reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT { assert(GC_PAGE_OFFSET >= sizeof(void*)); pg->nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / p->osize; @@ -1310,7 +1285,7 @@ static NOINLINE jl_taggedvalue_t *add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT } // Size includes the tag and the tag is not cleared!! -static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, +STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int osize) { // Use the pool offset instead of the pool address as the argument @@ -1328,7 +1303,7 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset jl_atomic_load_relaxed(&ptls->gc_num.poolalloc) + 1); // first try to use the freelist jl_taggedvalue_t *v = p->freelist; - if (v) { + if (v != NULL) { jl_taggedvalue_t *next = v->next; p->freelist = next; if (__unlikely(gc_page_data(v) != gc_page_data(next))) { @@ -1348,8 +1323,8 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset // If there's no pages left or the current page is used up, // we need to use the slow path. char *cur_page = gc_page_data((char*)v - 1); - if (__unlikely(!v || cur_page + GC_PAGE_SZ < (char*)next)) { - if (v) { + if (__unlikely(v == NULL || cur_page + GC_PAGE_SZ < (char*)next)) { + if (v != NULL) { // like the freelist case, // but only update the page metadata when it is full jl_gc_pagemeta_t *pg = jl_assume(page_metadata((char*)v - 1)); @@ -1359,7 +1334,7 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset v = *(jl_taggedvalue_t**)cur_page; } // Not an else!! - if (!v) + if (v == NULL) v = add_page(p); next = (jl_taggedvalue_t*)((char*)v + osize); } @@ -1373,7 +1348,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, int osize) { jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); - maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag); return val; } @@ -1519,7 +1493,7 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t } // the actual sweeping over all allocated pages in a memory pool -static inline void sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE void sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg, int sweep_full) JL_NOTSAFEPOINT { int p_n = pg->pool_n; int t_n = pg->thread_n; @@ -1530,7 +1504,7 @@ static inline void sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg } // sweep over a pagetable0 for all allocated pages -static inline int sweep_pool_pagetable0(jl_taggedvalue_t ***pfl, pagetable0_t *pagetable0, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE int sweep_pool_pagetable0(jl_taggedvalue_t ***pfl, pagetable0_t *pagetable0, int sweep_full) JL_NOTSAFEPOINT { unsigned ub = 0; unsigned alloc = 0; @@ -1554,7 +1528,7 @@ static inline int sweep_pool_pagetable0(jl_taggedvalue_t ***pfl, pagetable0_t *p } // sweep over pagetable1 for all pagetable0 that may contain allocated pages -static inline int sweep_pool_pagetable1(jl_taggedvalue_t ***pfl, pagetable1_t *pagetable1, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE int sweep_pool_pagetable1(jl_taggedvalue_t ***pfl, pagetable1_t *pagetable1, int sweep_full) JL_NOTSAFEPOINT { unsigned ub = 0; unsigned alloc = 0; @@ -1583,7 +1557,7 @@ static void sweep_pool_pagetable(jl_taggedvalue_t ***pfl, int sweep_full) JL_NOT { if (REGION2_PG_COUNT == 1) { // compile-time optimization pagetable1_t *pagetable1 = memory_map.meta1[0]; - if (pagetable1) + if (pagetable1 != NULL) sweep_pool_pagetable1(pfl, pagetable1, sweep_full); return; } @@ -1682,10 +1656,10 @@ static void gc_sweep_pool(int sweep_full) // null out terminal pointers of free lists for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) - continue; - for (int i = 0; i < JL_GC_N_POOLS; i++) { - *pfl[t_i * JL_GC_N_POOLS + i] = NULL; + if (ptls2 != NULL) { + for (int i = 0; i < JL_GC_N_POOLS; i++) { + *pfl[t_i * JL_GC_N_POOLS + i] = NULL; + } } } @@ -1761,7 +1735,7 @@ static void *volatile gc_findval; // for usage from gdb, for finding the gc-root // Handle the case where the stack is only partially copied. STATIC_INLINE uintptr_t gc_get_stack_addr(void *_addr, uintptr_t offset, - uintptr_t lb, uintptr_t ub) + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT { uintptr_t addr = (uintptr_t)_addr; if (addr >= lb && addr < ub) @@ -1770,901 +1744,548 @@ STATIC_INLINE uintptr_t gc_get_stack_addr(void *_addr, uintptr_t offset, } STATIC_INLINE uintptr_t gc_read_stack(void *_addr, uintptr_t offset, - uintptr_t lb, uintptr_t ub) + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT { uintptr_t real_addr = gc_get_stack_addr(_addr, offset, lb, ub); return *(uintptr_t*)real_addr; } JL_NORETURN NOINLINE void gc_assert_datatype_fail(jl_ptls_t ptls, jl_datatype_t *vt, - jl_gc_mark_sp_t sp) + jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { jl_safe_printf("GC error (probable corruption) :\n"); jl_gc_debug_print_status(); jl_(vt); jl_gc_debug_critical_error(); - gc_mark_loop_unwind(ptls, sp, 0); + gc_mark_loop_unwind(ptls, mq, 0); abort(); } -// This stores the label address in the mark loop function. -// We can't directly store that to a global array so we need some hack to get that. -// See the call to `gc_mark_loop` in init with a `NULL` `ptls`. -void *gc_mark_label_addrs[_GC_MARK_L_MAX]; - -// Double the local mark stack (both pc and data) -static void NOINLINE gc_mark_stack_resize(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) JL_NOTSAFEPOINT +// Check if `nptr` is tagged for `old + refyoung`, +// Push the object to the remset and update the `nptr` counter if necessary. +STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, + uintptr_t nptr) JL_NOTSAFEPOINT { - jl_gc_mark_data_t *old_data = gc_cache->data_stack; - void **pc_stack = sp->pc_start; - size_t stack_size = (char*)sp->pc_end - (char*)pc_stack; - ptrdiff_t datadiff = (char*)sp->data - (char*)old_data; - gc_cache->data_stack = (jl_gc_mark_data_t *)realloc_s(old_data, stack_size * 2 * sizeof(jl_gc_mark_data_t)); - sp->data = (jl_gc_mark_data_t *)((char*)gc_cache->data_stack + datadiff); - - sp->pc_start = gc_cache->pc_stack = (void**)realloc_s(pc_stack, stack_size * 2 * sizeof(void*)); - gc_cache->pc_stack_end = sp->pc_end = sp->pc_start + stack_size * 2; - sp->pc = sp->pc_start + (sp->pc - pc_stack); + if (__unlikely((nptr & 0x3) == 0x3)) { + ptls->heap.remset_nptr += nptr >> 2; + arraylist_t *remset = ptls->heap.remset; + size_t len = remset->len; + if (__unlikely(len >= remset->max)) { + arraylist_push(remset, obj); + } + else { + remset->len = len + 1; + remset->items[len] = obj; + } + } } -// Push a work item to the stack. The type of the work item is marked with `pc`. -// The data needed is in `data` and is of size `data_size`. -// If there isn't enough space on the stack, the stack will be resized with the stack -// lock held. The caller should invalidate any local cache of the stack addresses that's not -// in `gc_cache` or `sp` -// The `sp` will be updated on return if `inc` is true. -STATIC_INLINE void gc_mark_stack_push(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - void *pc, void *data, size_t data_size, int inc) JL_NOTSAFEPOINT +// Double the mark queue +static NOINLINE void gc_markqueue_resize(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { - assert(data_size <= sizeof(jl_gc_mark_data_t)); - if (__unlikely(sp->pc == sp->pc_end)) - gc_mark_stack_resize(gc_cache, sp); - *sp->pc = pc; - memcpy(sp->data, data, data_size); - if (inc) { - sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + data_size); - sp->pc++; - } + jl_value_t **old_start = mq->start; + size_t old_queue_size = (mq->end - mq->start); + size_t offset = (mq->current - old_start); + mq->start = (jl_value_t **)realloc_s(old_start, 2 * old_queue_size * sizeof(jl_value_t *)); + mq->current = (mq->start + offset); + mq->end = (mq->start + 2 * old_queue_size); } -// Check if the reference is non-NULL and atomically set the mark bit. -// Update `*nptr`, which is the `nptr` field of the parent item, if the object is young. -// Return the tag (with GC bits cleared) and the GC bits in `*ptag` and `*pbits`. -// Return whether the object needs to be scanned / have metadata updated. -STATIC_INLINE int gc_try_setmark(jl_value_t *obj, uintptr_t *nptr, - uintptr_t *ptag, uint8_t *pbits) JL_NOTSAFEPOINT +// Push a work item to the queue +STATIC_INLINE void gc_markqueue_push(jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT { - if (!obj) - return 0; - jl_taggedvalue_t *o = jl_astaggedvalue(obj); - uintptr_t tag = o->header; - if (!gc_marked(tag)) { - uint8_t bits; - int res = gc_setmark_tag(o, GC_MARKED, tag, &bits); - if (!gc_old(bits)) - *nptr = *nptr | 1; - *ptag = tag & ~(uintptr_t)0xf; - *pbits = bits; - return __likely(res); - } - else if (!gc_old(tag)) { - *nptr = *nptr | 1; - } - return 0; + if (__unlikely(mq->current == mq->end)) + gc_markqueue_resize(mq); + *mq->current = obj; + mq->current++; } -// Queue a finalizer list to be scanned in the mark loop. Start marking from index `start`. -void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - arraylist_t *list, size_t start) +// Pop from the mark queue +STATIC_INLINE jl_value_t *gc_markqueue_pop(jl_gc_markqueue_t *mq) { - size_t len = list->len; - if (len <= start) - return; - jl_value_t **items = (jl_value_t**)list->items; - gc_mark_finlist_t markdata = {items + start, items + len}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_finlist], - &markdata, sizeof(markdata), 1); + jl_value_t *obj = NULL; + if (mq->current != mq->start) { + mq->current--; + obj = *mq->current; + } + return obj; } -// Queue a object to be scanned. The object should already be marked and the GC metadata -// should already be updated for it. Only scanning of the object should be performed. -STATIC_INLINE void gc_mark_queue_scan_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - jl_value_t *obj) +// Double the chunk queue +static NOINLINE void gc_chunkqueue_resize(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { - jl_taggedvalue_t *o = jl_astaggedvalue(obj); - uintptr_t tag = o->header; - uint8_t bits = tag & 0xf; - tag = tag & ~(uintptr_t)0xf; - gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_scan_only], - &data, sizeof(data), 1); + jl_gc_chunk_t *old_start = mq->chunk_start; + size_t old_queue_size = (mq->chunk_end - mq->chunk_start); + size_t offset = (mq->current_chunk - old_start); + mq->chunk_start = (jl_gc_chunk_t *)realloc_s(old_start, 2 * old_queue_size * sizeof(jl_gc_chunk_t)); + mq->current_chunk = (mq->chunk_start + offset); + mq->chunk_end = (mq->chunk_start + 2 * old_queue_size); } -// Mark and queue a object to be scanned. -// The object will be marked atomically which can also happen concurrently. -// It will be queued if the object wasn't marked already (or concurrently by another thread) -// Returns whether the object is young. -STATIC_INLINE int gc_mark_queue_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, void *_obj) JL_NOTSAFEPOINT +// Push chunk `*c` into chunk queue +STATIC_INLINE void gc_chunkqueue_push(jl_gc_markqueue_t *mq, jl_gc_chunk_t *c) JL_NOTSAFEPOINT { - jl_value_t *obj = (jl_value_t*)jl_assume(_obj); - uintptr_t nptr = 0; - uintptr_t tag = 0; - uint8_t bits = 0; - if (!gc_try_setmark(obj, &nptr, &tag, &bits)) - return (int)nptr; - gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_marked_obj], - &data, sizeof(data), 1); - return (int)nptr; + if (__unlikely(mq->current_chunk == mq->chunk_end)) + gc_chunkqueue_resize(mq); + *mq->current_chunk = *c; + mq->current_chunk++; } -int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_value_t *obj) +// Pop chunk from chunk queue +STATIC_INLINE jl_gc_chunk_t gc_chunkqueue_pop(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { - return gc_mark_queue_obj(gc_cache, sp, obj); + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + if (mq->current_chunk != mq->chunk_start) { + mq->current_chunk--; + c = *mq->current_chunk; + } + return c; } -JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +// Enqueue an unmarked obj. last bit of `nptr` is set if `_obj` is young +STATIC_INLINE void gc_try_claim_and_push(jl_gc_markqueue_t *mq, void *_obj, + uintptr_t *nptr) JL_NOTSAFEPOINT { - return gc_mark_queue_obj(&ptls->gc_cache, &ptls->gc_mark_sp, obj); + if (_obj == NULL) + return; + jl_value_t *obj = (jl_value_t *)jl_assume(_obj); + jl_taggedvalue_t *o = jl_astaggedvalue(obj); + if (!gc_old(o->header) && nptr) + *nptr |= 1; + if (gc_try_setmark_tag(o, GC_MARKED)) + gc_markqueue_push(mq, obj); } -JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, - jl_value_t **objs, size_t nobjs) +// Mark object with 8bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj8(jl_ptls_t ptls, char *obj8_parent, uint8_t *obj8_begin, + uint8_t *obj8_end, uintptr_t nptr) JL_NOTSAFEPOINT { - gc_mark_objarray_t data = { parent, objs, objs + nobjs, 1, - jl_astaggedvalue(parent)->bits.gc & 2 }; - gc_mark_stack_push(&ptls->gc_cache, &ptls->gc_mark_sp, - gc_mark_label_addrs[GC_MARK_L_objarray], - &data, sizeof(data), 1); + (void)jl_assume(obj8_begin < obj8_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj8_begin < obj8_end; obj8_begin++) { + slot = &((jl_value_t**)obj8_parent)[*obj8_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj8_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj8_parent, slot, (jl_datatype_t*)jl_typeof(obj8_parent))); + if (obj8_begin + 1 != obj8_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); + } + else { + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; + } + gc_heap_snapshot_record_object_edge((jl_value_t*)obj8_parent, slot); + } + } + gc_mark_push_remset(ptls, (jl_value_t *)obj8_parent, nptr); + return new_obj; } - -// Check if `nptr` is tagged for `old + refyoung`, -// Push the object to the remset and update the `nptr` counter if necessary. -STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, uintptr_t nptr) JL_NOTSAFEPOINT +// Mark object with 16bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj16(jl_ptls_t ptls, char *obj16_parent, uint16_t *obj16_begin, + uint16_t *obj16_end, uintptr_t nptr) JL_NOTSAFEPOINT { - if (__unlikely((nptr & 0x3) == 0x3)) { - ptls->heap.remset_nptr += nptr >> 2; - arraylist_t *remset = ptls->heap.remset; - size_t len = remset->len; - if (__unlikely(len >= remset->max)) { - arraylist_push(remset, obj); - } - else { - remset->len = len + 1; - remset->items[len] = obj; + (void)jl_assume(obj16_begin < obj16_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj16_begin < obj16_end; obj16_begin++) { + slot = &((jl_value_t **)obj16_parent)[*obj16_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj16_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj16_parent, slot, (jl_datatype_t*)jl_typeof(obj16_parent))); + gc_try_claim_and_push(mq, new_obj, &nptr); + if (obj16_begin + 1 != obj16_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); + } + else { + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; + } + gc_heap_snapshot_record_object_edge((jl_value_t*)obj16_parent, slot); } } + gc_mark_push_remset(ptls, (jl_value_t *)obj16_parent, nptr); + return new_obj; } -// Scan a dense array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_objarray(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_objarray_t *objary, - jl_value_t **begin, jl_value_t **end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) +// Mark object with 32bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj32(jl_ptls_t ptls, char *obj32_parent, uint32_t *obj32_begin, + uint32_t *obj32_end, uintptr_t nptr) JL_NOTSAFEPOINT { - (void)jl_assume(objary == (gc_mark_objarray_t*)sp->data); - for (; begin < end; begin += objary->step) { - *pnew_obj = *begin; - if (*pnew_obj) { - verify_parent2("obj array", objary->parent, begin, "elem(%d)", - gc_slot_to_arrayidx(objary->parent, begin)); - gc_heap_snapshot_record_array_edge(objary->parent, begin); - } - if (!gc_try_setmark(*pnew_obj, &objary->nptr, ptag, pbits)) - continue; - begin += objary->step; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - objary->begin = begin; - gc_repush_markdata(sp, gc_mark_objarray_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, objary->parent, objary->nptr); - } - return 1; - } - gc_mark_push_remset(ptls, objary->parent, objary->nptr); - return 0; -} - -// Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_array8_t *ary8, - jl_value_t **begin, jl_value_t **end, - uint8_t *elem_begin, uint8_t *elem_end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(ary8 == (gc_mark_array8_t*)sp->data); - size_t elsize = ((jl_array_t*)ary8->elem.parent)->elsize / sizeof(jl_value_t*); - for (; begin < end; begin += elsize) { - for (; elem_begin < elem_end; elem_begin++) { - jl_value_t **slot = &begin[*elem_begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("array", ary8->elem.parent, slot, "elem(%d)", - gc_slot_to_arrayidx(ary8->elem.parent, begin)); - gc_heap_snapshot_record_array_edge(ary8->elem.parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &ary8->elem.nptr, ptag, pbits)) - continue; - elem_begin++; - // Found an object to mark - if (elem_begin < elem_end) { - // Haven't done with this one yet. Update the content and push it back - ary8->elem.begin = elem_begin; - ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); + (void)jl_assume(obj32_begin < obj32_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj32_begin < obj32_end; obj32_begin++) { + slot = &((jl_value_t **)obj32_parent)[*obj32_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj32_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj32_parent, slot, (jl_datatype_t*)jl_typeof(obj32_parent))); + if (obj32_begin + 1 != obj32_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); } else { - begin += elsize; - if (begin < end) { - // Haven't done with this array yet. Reset the content and push it back - ary8->elem.begin = ary8->rebegin; - ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, ary8->elem.parent, ary8->elem.nptr); - } + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; + } + gc_heap_snapshot_record_object_edge((jl_value_t*)obj32_parent, slot); + } + } + return new_obj; +} + +// Mark object array +STATIC_INLINE void gc_mark_objarray(jl_ptls_t ptls, jl_value_t *obj_parent, jl_value_t **obj_begin, + jl_value_t **obj_end, uint32_t step, uintptr_t nptr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + // Decide whether need to chunk objary + (void)jl_assume(step > 0); + size_t nobjs = (obj_end - obj_begin) / step; + if (nobjs > MAX_REFS_AT_ONCE) { + jl_gc_chunk_t c = {GC_objary_chunk, obj_parent, obj_begin + step * MAX_REFS_AT_ONCE, + obj_end, NULL, NULL, + step, nptr}; + gc_chunkqueue_push(mq, &c); + obj_end = obj_begin + step * MAX_REFS_AT_ONCE; + } + for (; obj_begin < obj_end; obj_begin += step) { + new_obj = *obj_begin; + if (new_obj != NULL) { + verify_parent2("obj array", obj_parent, obj_begin, "elem(%d)", + gc_slot_to_arrayidx(obj_parent, obj_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(obj_parent, &new_obj); + } + } + gc_mark_push_remset(ptls, obj_parent, nptr); +} + +// Mark array with 8bit field descriptors +STATIC_INLINE void gc_mark_array8(jl_ptls_t ptls, jl_value_t *ary8_parent, jl_value_t **ary8_begin, + jl_value_t **ary8_end, uint8_t *elem_begin, uint8_t *elem_end, + uintptr_t nptr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + size_t elsize = ((jl_array_t *)ary8_parent)->elsize / sizeof(jl_value_t *); + // Decide whether need to chunk ary8 + size_t nrefs = (ary8_end - ary8_begin) / elsize; + if (nrefs > MAX_REFS_AT_ONCE) { + jl_gc_chunk_t c = {GC_ary8_chunk, ary8_parent, ary8_begin + elsize * MAX_REFS_AT_ONCE, + ary8_end, elem_begin, elem_end, + 0, nptr}; + gc_chunkqueue_push(mq, &c); + ary8_end = ary8_begin + elsize * MAX_REFS_AT_ONCE; + } + for (; ary8_begin < ary8_end; ary8_begin += elsize) { + for (uint8_t *pindex = elem_begin; pindex < elem_end; pindex++) { + new_obj = ary8_begin[*pindex]; + if (new_obj != NULL) { + verify_parent2("array", ary8_parent, &new_obj, "elem(%d)", + gc_slot_to_arrayidx(ary8_parent, ary8_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(ary8_parent, &new_obj); } - return 1; } - elem_begin = ary8->rebegin; - } - gc_mark_push_remset(ptls, ary8->elem.parent, ary8->elem.nptr); - return 0; -} - -// Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_array16_t *ary16, - jl_value_t **begin, jl_value_t **end, - uint16_t *elem_begin, uint16_t *elem_end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(ary16 == (gc_mark_array16_t*)sp->data); - size_t elsize = ((jl_array_t*)ary16->elem.parent)->elsize / sizeof(jl_value_t*); - for (; begin < end; begin += elsize) { - for (; elem_begin < elem_end; elem_begin++) { - jl_value_t **slot = &begin[*elem_begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("array", ary16->elem.parent, slot, "elem(%d)", - gc_slot_to_arrayidx(ary16->elem.parent, begin)); - gc_heap_snapshot_record_array_edge(ary16->elem.parent, slot); + } + gc_mark_push_remset(ptls, ary8_parent, nptr); +} + +// Mark array with 16bit field descriptors +STATIC_INLINE void gc_mark_array16(jl_ptls_t ptls, jl_value_t *ary16_parent, jl_value_t **ary16_begin, + jl_value_t **ary16_end, uint16_t *elem_begin, uint16_t *elem_end, + uintptr_t nptr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + size_t elsize = ((jl_array_t *)ary16_parent)->elsize / sizeof(jl_value_t *); + // Decide whether need to chunk ary16 + size_t nrefs = (ary16_end - ary16_begin) / elsize; + if (nrefs > MAX_REFS_AT_ONCE) { + jl_gc_chunk_t c = {GC_ary16_chunk, ary16_parent, ary16_begin + elsize * MAX_REFS_AT_ONCE, + ary16_end, elem_begin, elem_end, + 0, nptr}; + gc_chunkqueue_push(mq, &c); + ary16_end = ary16_begin + elsize * MAX_REFS_AT_ONCE; + } + for (; ary16_begin < ary16_end; ary16_begin += elsize) { + for (uint16_t *pindex = elem_begin; pindex < elem_end; pindex++) { + new_obj = ary16_begin[*pindex]; + if (new_obj != NULL) { + verify_parent2("array", ary16_parent, &new_obj, "elem(%d)", + gc_slot_to_arrayidx(ary16_parent, ary16_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(ary16_parent, &new_obj); } - if (!gc_try_setmark(*pnew_obj, &ary16->elem.nptr, ptag, pbits)) - continue; - elem_begin++; - // Found an object to mark - if (elem_begin < elem_end) { - // Haven't done with this one yet. Update the content and push it back - ary16->elem.begin = elem_begin; - ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); + } + } + gc_mark_push_remset(ptls, ary16_parent, nptr); +} + +// Mark chunk of large array +STATIC_INLINE void gc_mark_chunk(jl_ptls_t ptls, jl_gc_markqueue_t *mq, jl_gc_chunk_t *c) JL_NOTSAFEPOINT +{ + switch (c->cid) { + case GC_objary_chunk: { + jl_value_t *obj_parent = c->parent; + jl_value_t **obj_begin = c->begin; + jl_value_t **obj_end = c->end; + uint32_t step = c->step; + uintptr_t nptr = c->nptr; + gc_mark_objarray(ptls, obj_parent, obj_begin, obj_end, step, + nptr); + break; + } + case GC_ary8_chunk: { + jl_value_t *ary8_parent = c->parent; + jl_value_t **ary8_begin = c->begin; + jl_value_t **ary8_end = c->end; + uint8_t *elem_begin = (uint8_t *)c->elem_begin; + uint8_t *elem_end = (uint8_t *)c->elem_end; + uintptr_t nptr = c->nptr; + gc_mark_array8(ptls, ary8_parent, ary8_begin, ary8_end, elem_begin, elem_end, + nptr); + break; + } + case GC_ary16_chunk: { + jl_value_t *ary16_parent = c->parent; + jl_value_t **ary16_begin = c->begin; + jl_value_t **ary16_end = c->end; + uint16_t *elem_begin = (uint16_t *)c->elem_begin; + uint16_t *elem_end = (uint16_t *)c->elem_end; + uintptr_t nptr = c->nptr; + gc_mark_array16(ptls, ary16_parent, ary16_begin, ary16_end, elem_begin, elem_end, + nptr); + break; + } + case GC_finlist_chunk: { + jl_value_t **fl_begin = c->begin; + jl_value_t **fl_end = c->end; + gc_mark_finlist_(mq, fl_begin, fl_end); + break; + } + default: { + // `empty-chunk` should be checked by caller + jl_safe_printf("GC internal error: chunk mismatch cid=%d\n", c->cid); + abort(); + } + } +} + +// Mark gc frame +STATIC_INLINE void gc_mark_stack(jl_ptls_t ptls, jl_gcframe_t *s, uint32_t nroots, uintptr_t offset, + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + uint32_t nr = nroots >> 2; + while (1) { + jl_value_t ***rts = (jl_value_t ***)(((void **)s) + 2); + for (uint32_t i = 0; i < nr; i++) { + if (nroots & 1) { + void **slot = (void **)gc_read_stack(&rts[i], offset, lb, ub); + new_obj = (jl_value_t *)gc_read_stack(slot, offset, lb, ub); } else { - begin += elsize; - if (begin < end) { - // Haven't done with this array yet. Reset the content and push it back - ary16->elem.begin = ary16->rebegin; - ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, ary16->elem.parent, ary16->elem.nptr); + new_obj = (jl_value_t *)gc_read_stack(&rts[i], offset, lb, ub); + if (gc_ptr_tag(new_obj, 1)) { + // handle tagged pointers in finalizer list + new_obj = (jl_value_t *)gc_ptr_clear_tag(new_obj, 1); + // skip over the finalizer fptr + i++; } + if (gc_ptr_tag(new_obj, 2)) + continue; + } + if (new_obj != NULL) { + gc_try_claim_and_push(mq, new_obj, NULL); + gc_heap_snapshot_record_frame_to_object_edge(s, new_obj); + } + } + jl_gcframe_t *sprev = (jl_gcframe_t *)gc_read_stack(&s->prev, offset, lb, ub); + if (sprev == NULL) + break; + gc_heap_snapshot_record_frame_to_frame_edge(s, sprev); + s = sprev; + uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); + assert(new_nroots <= UINT32_MAX); + nroots = (uint32_t)new_nroots; + nr = nroots >> 2; + } +} + +// Mark exception stack +STATIC_INLINE void gc_mark_excstack(jl_ptls_t ptls, jl_excstack_t *excstack, size_t itr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + while (itr > 0) { + size_t bt_size = jl_excstack_bt_size(excstack, itr); + jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr); + for (size_t bt_index = 0; bt_index < bt_size; + bt_index += jl_bt_entry_size(bt_data + bt_index)) { + jl_bt_element_t *bt_entry = bt_data + bt_index; + if (jl_bt_is_native(bt_entry)) + continue; + // Found an extended backtrace entry: iterate over any + // GC-managed values inside. + size_t njlvals = jl_bt_num_jlvals(bt_entry); + for (size_t jlval_index = 0; jlval_index < njlvals; jlval_index++) { + new_obj = jl_bt_entry_jlvalue(bt_entry, jlval_index); + gc_try_claim_and_push(mq, new_obj, NULL); + gc_heap_snapshot_record_frame_to_object_edge(bt_entry, new_obj); } - return 1; } - elem_begin = ary16->rebegin; + // The exception comes last - mark it + new_obj = jl_excstack_exception(excstack, itr); + itr = jl_excstack_next(excstack, itr); + gc_try_claim_and_push(mq, new_obj, NULL); + gc_heap_snapshot_record_frame_to_object_edge(excstack, new_obj); } - gc_mark_push_remset(ptls, ary16->elem.parent, ary16->elem.nptr); - return 0; } +// Mark module binding +STATIC_INLINE void gc_mark_module_binding(jl_ptls_t ptls, jl_module_t *mb_parent, jl_binding_t **mb_begin, + jl_binding_t **mb_end, uintptr_t nptr, + uint8_t bits) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + for (; mb_begin < mb_end; mb_begin++) { + jl_binding_t *b = *mb_begin; + if (b == (jl_binding_t *)jl_nothing) + continue; + verify_parent1("module", mb_parent, mb_begin, "binding_buff"); + gc_try_claim_and_push(mq, b, &nptr); + } + jl_value_t *bindings = (jl_value_t *)jl_atomic_load_relaxed(&mb_parent->bindings); + gc_try_claim_and_push(mq, bindings, &nptr); + jl_value_t *bindingkeyset = (jl_value_t *)jl_atomic_load_relaxed(&mb_parent->bindingkeyset); + gc_try_claim_and_push(mq, bindingkeyset, &nptr); + gc_try_claim_and_push(mq, (jl_value_t *)mb_parent->parent, &nptr); + size_t nusings = mb_parent->usings.len; + if (nusings > 0) { + // this is only necessary because bindings for "using" modules + // are added only when accessed. therefore if a module is replaced + // after "using" it but before accessing it, this array might + // contain the only reference. + jl_value_t *obj_parent = (jl_value_t *)mb_parent; + jl_value_t **objary_begin = (jl_value_t **)mb_parent->usings.items; + jl_value_t **objary_end = objary_begin + nusings; + gc_mark_objarray(ptls, obj_parent, objary_begin, objary_end, 1, nptr); + } + else { + gc_mark_push_remset(ptls, (jl_value_t *)mb_parent, nptr); + } +} -// Scan an object with 8bits field descriptors. see `gc_mark_obj8_t` -STATIC_INLINE int gc_mark_scan_obj8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj8_t *obj8, - char *parent, uint8_t *begin, uint8_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) { - (void)jl_assume(obj8 == (gc_mark_obj8_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &obj8->nptr, ptag, pbits)) + jl_value_t *new_obj; + // Decide whether need to chunk finlist + size_t nrefs = (fl_end - fl_begin); + if (nrefs > MAX_REFS_AT_ONCE) { + jl_gc_chunk_t c = {GC_finlist_chunk, NULL, fl_begin + MAX_REFS_AT_ONCE, fl_end, 0, 0, 0, 0}; + gc_chunkqueue_push(mq, &c); + fl_end = fl_begin + MAX_REFS_AT_ONCE; + } + for (; fl_begin < fl_end; fl_begin++) { + new_obj = *fl_begin; + if (__unlikely(!new_obj)) continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj8->begin = begin; - gc_repush_markdata(sp, gc_mark_obj8_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, obj8->parent, obj8->nptr); + if (gc_ptr_tag(new_obj, 1)) { + new_obj = (jl_value_t *)gc_ptr_clear_tag(new_obj, 1); + fl_begin++; + assert(fl_begin < fl_end); } - return 1; - } - gc_mark_push_remset(ptls, obj8->parent, obj8->nptr); - return 0; -} - -// Scan an object with 16bits field descriptors. see `gc_mark_obj16_t` -STATIC_INLINE int gc_mark_scan_obj16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj16_t *obj16, - char *parent, uint16_t *begin, uint16_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) JL_NOTSAFEPOINT -{ - (void)jl_assume(obj16 == (gc_mark_obj16_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &obj16->nptr, ptag, pbits)) + if (gc_ptr_tag(new_obj, 2)) continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj16->begin = begin; - gc_repush_markdata(sp, gc_mark_obj16_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); - } - return 1; - } - gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); - return 0; -} - -// Scan an object with 32bits field descriptors. see `gc_mark_obj32_t` -STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj32_t *obj32, - char *parent, uint32_t *begin, uint32_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(obj32 == (gc_mark_obj32_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &obj32->nptr, ptag, pbits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj32->begin = begin; - gc_repush_markdata(sp, gc_mark_obj32_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, obj32->parent, obj32->nptr); - } - return 1; + gc_try_claim_and_push(mq, new_obj, NULL); } - gc_mark_push_remset(ptls, obj32->parent, obj32->nptr); - return 0; } -#if defined(__GNUC__) && !defined(_OS_EMSCRIPTEN_) -# define gc_mark_laddr(name) (&&name) -# define gc_mark_jmp(ptr) goto *(ptr) -#else -#define gc_mark_laddr(name) ((void*)(uintptr_t)GC_MARK_L_##name) -#define gc_mark_jmp(ptr) do { \ - switch ((int)(uintptr_t)ptr) { \ - case GC_MARK_L_marked_obj: \ - goto marked_obj; \ - case GC_MARK_L_scan_only: \ - goto scan_only; \ - case GC_MARK_L_finlist: \ - goto finlist; \ - case GC_MARK_L_objarray: \ - goto objarray; \ - case GC_MARK_L_array8: \ - goto array8; \ - case GC_MARK_L_array16: \ - goto array16; \ - case GC_MARK_L_obj8: \ - goto obj8; \ - case GC_MARK_L_obj16: \ - goto obj16; \ - case GC_MARK_L_obj32: \ - goto obj32; \ - case GC_MARK_L_stack: \ - goto stack; \ - case GC_MARK_L_excstack: \ - goto excstack; \ - case GC_MARK_L_module_binding: \ - goto module_binding; \ - default: \ - abort(); \ - } \ - } while (0) -#endif - -// This is the main marking loop. -// It uses an iterative (mostly) Depth-first search (DFS) to mark all the objects. -// Instead of using the native stack, two stacks are manually maintained, -// one (fixed-size) pc stack which stores the return address and one (variable-size) -// data stack which stores the local variables needed by the scanning code. -// Using a manually maintained stack has a few advantages -// -// 1. We can resize the stack as we go and never worry about stack overflow -// This is especitally useful when enters the GC in a deep call stack. -// It also removes the very deep GC call stack in a profile. -// 2. We can minimize the number of local variables to save on the stack. -// This includes minimizing the sizes of the stack frames and only saving variables -// that have been changed before making "function calls" (i.e. `goto mark;`) -// 3. We can perform end-of-loop tail-call optimization for common cases. -// 4. The marking can be interrupted more easily since all the states are maintained -// in a well-defined format already. -// This will be useful if we want to have incremental marking again. -// 5. The frames can be stolen by another thread more easily and it is not necessary -// to copy works to be stolen to another queue. Useful for parallel marking. -// (Will still require synchronization in stack popping of course.) -// 6. A flat function (i.e. no or very few function calls) also give the compiler -// opportunity to keep more states in registers that doesn't have to be spilled as often. -// -// We use two stacks so that the thief on another thread can steal the fixed sized pc stack -// and use that to figure out the size of the struct on the variable size data stack. -// -// The main disadvantages are that we bypass some stack-based CPU optimizations including the -// stack engine and return address prediction. -// Using two stacks also double the number of operations on the stack pointer -// though we still only need to use one of them (the pc stack pointer) for bounds check. -// In general, it seems that the reduction of stack memory ops and instructions count -// have a larger positive effect on the performance. =) - -// As a general guide we do not want to make non-inlined function calls in this function -// if possible since a large number of registers has to be spilled when that happens. -// This is especially true on on X86 which doesn't have many (any?) -// callee saved general purpose registers. -// (OTOH, the spill will likely make use of the stack engine which is otherwise idle so -// the performance impact is minimum as long as it's not in the hottest path) - -// There are three external entry points to the loop, corresponding to label -// `marked_obj`, `scan_only` and `finlist` (see the corresponding functions -// `gc_mark_queue_obj`, `gc_mark_queue_scan_obj` and `gc_mark_queue_finlist` above). -// The scanning of the object starts with `goto mark`, which updates the metadata and scans -// the object whose information is stored in `new_obj`, `tag` and `bits`. -// The branches in `mark` will dispatch the object to one of the scan "loop"s to be scanned -// as either a normal julia object or one of the special objects with specific storage format. -// Each of the scan "loop" will perform a DFS of the object in the following way -// -// 1. When encountering an pointer (julia object reference) slots, load, perform NULL check -// and atomically set the mark bits to determine if the object needs to be scanned. -// 2. If yes, it'll push itself back onto the mark stack (after updating fields that are changed) -// using `gc_repush_markdata` to increment the stack pointers. -// This step can also be replaced by a tail call by finishing up the marking of the current -// object when the end of the current object is reached. -// 3. Jump to `mark`. The marking of the current object will be resumed after the child is -// scanned by popping the stack frame back. -// -// Some of the special object scannings use BFS to simplify the code (Task and Module). - -// The jumps from the dispatch to the scan "loop"s are done by first pushing a frame -// to the stacks while only increment the data stack pointer before jumping to the loop -// This way the scan "loop" gets exactly what it expects after a stack pop. -// Additional optimizations are done for some of the common cases by skipping -// the unnecessary data stack pointer increment and the load from the stack -// (i.e. store to load forwarding). See `objary_loaded`, `obj8_loaded` and `obj16_loaded`. -JL_EXTENSION NOINLINE void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp) -{ - if (__unlikely(ptls == NULL)) { - gc_mark_label_addrs[GC_MARK_L_marked_obj] = gc_mark_laddr(marked_obj); - gc_mark_label_addrs[GC_MARK_L_scan_only] = gc_mark_laddr(scan_only); - gc_mark_label_addrs[GC_MARK_L_finlist] = gc_mark_laddr(finlist); - gc_mark_label_addrs[GC_MARK_L_objarray] = gc_mark_laddr(objarray); - gc_mark_label_addrs[GC_MARK_L_array8] = gc_mark_laddr(array8); - gc_mark_label_addrs[GC_MARK_L_array16] = gc_mark_laddr(array16); - gc_mark_label_addrs[GC_MARK_L_obj8] = gc_mark_laddr(obj8); - gc_mark_label_addrs[GC_MARK_L_obj16] = gc_mark_laddr(obj16); - gc_mark_label_addrs[GC_MARK_L_obj32] = gc_mark_laddr(obj32); - gc_mark_label_addrs[GC_MARK_L_stack] = gc_mark_laddr(stack); - gc_mark_label_addrs[GC_MARK_L_excstack] = gc_mark_laddr(excstack); - gc_mark_label_addrs[GC_MARK_L_module_binding] = gc_mark_laddr(module_binding); - return; - } - - jl_value_t *new_obj = NULL; - uintptr_t tag = 0; - uint8_t bits = 0; - int meta_updated = 0; - - gc_mark_objarray_t *objary; - jl_value_t **objary_begin; - jl_value_t **objary_end; - - gc_mark_array8_t *ary8; - gc_mark_array16_t *ary16; - - gc_mark_obj8_t *obj8; - char *obj8_parent; - uint8_t *obj8_begin; - uint8_t *obj8_end; - - gc_mark_obj16_t *obj16; - char *obj16_parent; - uint16_t *obj16_begin; - uint16_t *obj16_end; - -pop: - if (sp.pc == sp.pc_start) { - // TODO: stealing form another thread +// Mark finalizer list (or list of objects following same format) +void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) +{ + size_t len = list->len; + if (len <= start) return; - } - sp.pc--; - gc_mark_jmp(*sp.pc); // computed goto - -marked_obj: { - // An object that has been marked and needs have metadata updated and scanned. - gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t); - new_obj = obj->obj; - tag = obj->tag; - bits = obj->bits; - goto mark; - } - -scan_only: { - // An object that has been marked and needs to be scanned. - gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t); - new_obj = obj->obj; - tag = obj->tag; - bits = obj->bits; - meta_updated = 1; - goto mark; - } - -objarray: - objary = gc_pop_markdata(&sp, gc_mark_objarray_t); - objary_begin = objary->begin; - objary_end = objary->end; -objarray_loaded: - if (gc_mark_scan_objarray(ptls, &sp, objary, objary_begin, objary_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -array8: - ary8 = gc_pop_markdata(&sp, gc_mark_array8_t); - objary_begin = ary8->begin; - objary_end = ary8->end; - obj8_begin = ary8->elem.begin; - obj8_end = ary8->elem.end; -array8_loaded: - if (gc_mark_scan_array8(ptls, &sp, ary8, objary_begin, objary_end, obj8_begin, obj8_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -array16: - ary16 = gc_pop_markdata(&sp, gc_mark_array16_t); - objary_begin = ary16->begin; - objary_end = ary16->end; - obj16_begin = ary16->elem.begin; - obj16_end = ary16->elem.end; -array16_loaded: - if (gc_mark_scan_array16(ptls, &sp, ary16, objary_begin, objary_end, obj16_begin, obj16_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj8: - obj8 = gc_pop_markdata(&sp, gc_mark_obj8_t); - obj8_parent = (char*)obj8->parent; - obj8_begin = obj8->begin; - obj8_end = obj8->end; -obj8_loaded: - if (gc_mark_scan_obj8(ptls, &sp, obj8, obj8_parent, obj8_begin, obj8_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj16: - obj16 = gc_pop_markdata(&sp, gc_mark_obj16_t); - obj16_parent = (char*)obj16->parent; - obj16_begin = obj16->begin; - obj16_end = obj16->end; -obj16_loaded: - if (gc_mark_scan_obj16(ptls, &sp, obj16, obj16_parent, obj16_begin, obj16_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj32: { - gc_mark_obj32_t *obj32 = gc_pop_markdata(&sp, gc_mark_obj32_t); - char *parent = (char*)obj32->parent; - uint32_t *begin = obj32->begin; - uint32_t *end = obj32->end; - if (gc_mark_scan_obj32(ptls, &sp, obj32, parent, begin, end, &new_obj, &tag, &bits)) - goto mark; - goto pop; - } - -stack: { - // Scan the stack. see `gc_mark_stackframe_t` - // The task object this stack belongs to is being scanned separately as a normal - // 8bit field descriptor object. - gc_mark_stackframe_t *stack = gc_pop_markdata(&sp, gc_mark_stackframe_t); - jl_gcframe_t *s = stack->s; - uint32_t i = stack->i; - uint32_t nroots = stack->nroots; - uintptr_t offset = stack->offset; - uintptr_t lb = stack->lb; - uintptr_t ub = stack->ub; - uint32_t nr = nroots >> 2; - uintptr_t nptr = 0; - while (1) { - jl_value_t ***rts = (jl_value_t***)(((void**)s) + 2); - for (; i < nr; i++) { - if (nroots & 1) { - void **slot = (void**)gc_read_stack(&rts[i], offset, lb, ub); - new_obj = (jl_value_t*)gc_read_stack(slot, offset, lb, ub); - } - else { - new_obj = (jl_value_t*)gc_read_stack(&rts[i], offset, lb, ub); - if (gc_ptr_tag(new_obj, 1)) { - // handle tagged pointers in finalizer list - new_obj = gc_ptr_clear_tag(new_obj, 1); - // skip over the finalizer fptr - i++; - } - if (gc_ptr_tag(new_obj, 2)) - continue; - } - if (!gc_try_setmark(new_obj, &nptr, &tag, &bits)) - continue; - gc_heap_snapshot_record_frame_to_object_edge(s, new_obj); - i++; - if (i < nr) { - // Haven't done with this one yet. Update the content and push it back - stack->i = i; - gc_repush_markdata(&sp, gc_mark_stackframe_t); - } - // TODO stack addresses needs copy stack handling - else if ((s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub))) { - gc_heap_snapshot_record_frame_to_frame_edge(stack->s, s); - stack->s = s; - stack->i = 0; - uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); - assert(new_nroots <= UINT32_MAX); - stack->nroots = (uint32_t)new_nroots; - gc_repush_markdata(&sp, gc_mark_stackframe_t); - } - goto mark; - } - s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub); - // walk up one stack frame - if (s != 0) { - gc_heap_snapshot_record_frame_to_frame_edge(stack->s, s); - stack->s = s; - i = 0; - uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); - assert(new_nroots <= UINT32_MAX); - nroots = stack->nroots = (uint32_t)new_nroots; - nr = nroots >> 2; - continue; - } - goto pop; - } - } + jl_value_t **fl_begin = (jl_value_t **)list->items + start; + jl_value_t **fl_end = (jl_value_t **)list->items + len; + gc_mark_finlist_(mq, fl_begin, fl_end); +} -excstack: { - // Scan an exception stack - gc_mark_excstack_t *stackitr = gc_pop_markdata(&sp, gc_mark_excstack_t); - jl_excstack_t *excstack = stackitr->s; - size_t itr = stackitr->itr; - size_t bt_index = stackitr->bt_index; - size_t jlval_index = stackitr->jlval_index; - while (itr > 0) { - size_t bt_size = jl_excstack_bt_size(excstack, itr); - jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr); - for (; bt_index < bt_size; bt_index += jl_bt_entry_size(bt_data + bt_index)) { - jl_bt_element_t *bt_entry = bt_data + bt_index; - if (jl_bt_is_native(bt_entry)) - continue; - // Found an extended backtrace entry: iterate over any - // GC-managed values inside. - size_t njlvals = jl_bt_num_jlvals(bt_entry); - while (jlval_index < njlvals) { - new_obj = jl_bt_entry_jlvalue(bt_entry, jlval_index); - gc_heap_snapshot_record_frame_to_object_edge(bt_entry, new_obj); - uintptr_t nptr = 0; - jlval_index += 1; - if (gc_try_setmark(new_obj, &nptr, &tag, &bits)) { - stackitr->itr = itr; - stackitr->bt_index = bt_index; - stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); - goto mark; - } - } - jlval_index = 0; - } - // The exception comes last - mark it - new_obj = jl_excstack_exception(excstack, itr); - gc_heap_snapshot_record_frame_to_object_edge(excstack, new_obj); - itr = jl_excstack_next(excstack, itr); - bt_index = 0; - jlval_index = 0; - uintptr_t nptr = 0; - if (gc_try_setmark(new_obj, &nptr, &tag, &bits)) { - stackitr->itr = itr; - stackitr->bt_index = bt_index; - stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); - goto mark; - } - } - goto pop; - } - -module_binding: { - // Scan a module. see `gc_mark_binding_t` - // Other fields of the module will be scanned after the bindings are scanned - gc_mark_binding_t *binding = gc_pop_markdata(&sp, gc_mark_binding_t); - jl_binding_t **begin = binding->begin; - jl_binding_t **end = binding->end; - for (; begin < end; begin++) { - jl_binding_t *b = *begin; - if (b == (jl_binding_t*)jl_nothing) - continue; - verify_parent1("module", binding->parent, begin, "binding_buff"); - // Record the size used for the box for non-const bindings - gc_heap_snapshot_record_module_to_binding(binding->parent, b); - if (gc_try_setmark((jl_value_t*)b, &binding->nptr, &tag, &bits)) { - begin++; - binding->begin = begin; - gc_repush_markdata(&sp, gc_mark_binding_t); - new_obj = (jl_value_t*)b; - goto mark; - } - } - binding->begin = begin; - jl_module_t *m = binding->parent; - jl_value_t *bindings = (jl_value_t*)jl_atomic_load_relaxed(&m->bindings); - if (gc_try_setmark(bindings, &binding->nptr, &tag, &bits)) { - gc_repush_markdata(&sp, gc_mark_binding_t); - new_obj = (jl_value_t*)bindings; - goto mark; - } - jl_value_t *bindingkeyset = (jl_value_t*)jl_atomic_load_relaxed(&m->bindingkeyset); - if (gc_try_setmark(bindingkeyset, &binding->nptr, &tag, &bits)) { - gc_repush_markdata(&sp, gc_mark_binding_t); - new_obj = bindingkeyset; - goto mark; - } - int scanparent = gc_try_setmark((jl_value_t*)m->parent, &binding->nptr, &tag, &bits); - size_t nusings = m->usings.len; - if (nusings) { - // this is only necessary because bindings for "using" modules - // are added only when accessed. therefore if a module is replaced - // after "using" it but before accessing it, this array might - // contain the only reference. - objary_begin = (jl_value_t**)m->usings.items; - objary_end = objary_begin + nusings; - gc_mark_objarray_t data = {(jl_value_t*)m, objary_begin, objary_end, 1, binding->nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &data, sizeof(data), 0); - // gc_mark_scan_objarray will eventually handle the remset for m - if (!scanparent) { - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; - } - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(data)); - sp.pc++; - } - else { - // done with m - gc_mark_push_remset(ptls, (jl_value_t*)m, binding->nptr); - } - if (scanparent) { - new_obj = (jl_value_t*)m->parent; - goto mark; - } - goto pop; - } +JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +{ + int may_claim = gc_try_setmark_tag(jl_astaggedvalue(obj), GC_MARKED); + if (may_claim) + gc_markqueue_push(&ptls->mark_queue, obj); + return may_claim; +} -finlist: { - // Scan a finalizer (or format compatible) list. see `gc_mark_finlist_t` - gc_mark_finlist_t *finlist = gc_pop_markdata(&sp, gc_mark_finlist_t); - jl_value_t **begin = finlist->begin; - jl_value_t **end = finlist->end; - for (; begin < end; begin++) { - new_obj = *begin; - if (__unlikely(!new_obj)) - continue; - if (gc_ptr_tag(new_obj, 1)) { - new_obj = (jl_value_t*)gc_ptr_clear_tag(new_obj, 1); - begin++; - assert(begin < end); - } - if (gc_ptr_tag(new_obj, 2)) - continue; - uintptr_t nptr = 0; - if (!gc_try_setmark(new_obj, &nptr, &tag, &bits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - finlist->begin = begin; - gc_repush_markdata(&sp, gc_mark_finlist_t); - } - goto mark; - } - goto pop; - } +JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, + jl_value_t **objs, size_t nobjs) +{ + uintptr_t nptr = (nobjs << 2) & (jl_astaggedvalue(parent)->bits.gc & 3); + gc_mark_objarray(ptls, parent, objs, objs + nobjs, 1, nptr); +} -mark: { - // Generic scanning entry point. - // Expects `new_obj`, `tag` and `bits` to be set correctly. -#ifdef JL_DEBUG_BUILD +// Enqueue and mark all outgoing references from `new_obj` which have not been marked +// yet. `meta_updated` is mostly used to make sure we don't update metadata twice for +// objects which have been enqueued into the `remset` +FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_new_obj, + int meta_updated) +{ + jl_value_t *new_obj = (jl_value_t *)_new_obj; + mark_obj: { + #ifdef JL_DEBUG_BUILD if (new_obj == gc_findval) jl_raise_debugger(); -#endif + #endif jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); - jl_datatype_t *vt = (jl_datatype_t*)tag; - int foreign_alloc = 0; + jl_datatype_t *vt = (jl_datatype_t *)(o->header & ~(uintptr_t)0xf); + uint8_t bits = (gc_old(o->header) && !mark_reset_age) ? GC_OLD_MARKED : GC_MARKED; int update_meta = __likely(!meta_updated && !gc_verifying); + int foreign_alloc = 0; if (update_meta && jl_object_in_image(new_obj)) { foreign_alloc = 1; update_meta = 0; } - meta_updated = 0; // Symbols are always marked assert(vt != jl_symbol_type); if (vt == jl_simplevector_type) { size_t l = jl_svec_len(new_obj); jl_value_t **data = jl_svec_data(new_obj); - size_t dtsz = l * sizeof(void*) + sizeof(jl_svec_t); + size_t dtsz = l * sizeof(void *) + sizeof(jl_svec_t); if (update_meta) gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = data; + jl_value_t **objary_end = data + l; + uint32_t step = 1; uintptr_t nptr = (l << 2) | (bits & GC_OLD); - objary_begin = data; - objary_end = data + l; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (vt->name == jl_array_typename) { - jl_array_t *a = (jl_array_t*)new_obj; + jl_array_t *a = (jl_array_t *)new_obj; jl_array_flags_t flags = a->flags; if (update_meta) { if (flags.pooled) @@ -2672,9 +2293,10 @@ mark: { else gc_setmark_big(ptls, o, bits); } - else if (foreign_alloc) + else if (foreign_alloc) { objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_array_t)); - if (flags.how ==0){ + } + if (flags.how == 0) { void *data_ptr = (char*)a + sizeof(jl_array_t) +jl_array_ndimwords(a->flags.ndims) * sizeof(size_t); gc_heap_snapshot_record_hidden_edge(new_obj, data_ptr, jl_array_nbytes(a), 2); } @@ -2702,103 +2324,83 @@ mark: { else if (flags.how == 3) { jl_value_t *owner = jl_array_data_owner(a); uintptr_t nptr = (1 << 2) | (bits & GC_OLD); + gc_try_claim_and_push(mq, owner, &nptr); gc_heap_snapshot_record_internal_array_edge(new_obj, owner); - int markowner = gc_try_setmark(owner, &nptr, &tag, &bits); gc_mark_push_remset(ptls, new_obj, nptr); - if (markowner) { - new_obj = owner; - goto mark; - } - goto pop; + return; } - if (a->data == NULL || jl_array_len(a) == 0) - goto pop; + if (!a->data || jl_array_len(a) == 0) + return; if (flags.ptrarray) { - if ((jl_datatype_t*)jl_tparam0(vt) == jl_symbol_type) - goto pop; + if ((jl_datatype_t *)jl_tparam0(vt) == jl_symbol_type) + return; size_t l = jl_array_len(a); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = (jl_value_t **)a->data; + jl_value_t **objary_end = objary_begin + l; + uint32_t step = 1; uintptr_t nptr = (l << 2) | (bits & GC_OLD); - objary_begin = (jl_value_t**)a->data; - objary_end = objary_begin + l; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (flags.hasptr) { - jl_datatype_t *et = (jl_datatype_t*)jl_tparam0(vt); + jl_datatype_t *et = (jl_datatype_t *)jl_tparam0(vt); const jl_datatype_layout_t *layout = et->layout; unsigned npointers = layout->npointers; - unsigned elsize = a->elsize / sizeof(jl_value_t*); + unsigned elsize = a->elsize / sizeof(jl_value_t *); size_t l = jl_array_len(a); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = (jl_value_t **)a->data; + jl_value_t **objary_end = objary_begin + l * elsize; + uint32_t step = elsize; uintptr_t nptr = ((l * npointers) << 2) | (bits & GC_OLD); - objary_begin = (jl_value_t**)a->data; - objary_end = objary_begin + l * elsize; if (npointers == 1) { // TODO: detect anytime time stride is uniform? objary_begin += layout->first_ptr; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, elsize, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (layout->fielddesc_type == 0) { - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; - gc_mark_array8_t markdata = {objary_begin, objary_end, obj8_begin, {new_obj, obj8_begin, obj8_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array8), - &markdata, sizeof(markdata), 0); - ary8 = (gc_mark_array8_t*)sp.data; - goto array8_loaded; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; + gc_mark_array8(ptls, objary_parent, objary_begin, objary_end, obj8_begin, + obj8_end, nptr); } else if (layout->fielddesc_type == 1) { - obj16_begin = (uint16_t*)jl_dt_layout_ptrs(layout); - obj16_end = obj16_begin + npointers; - gc_mark_array16_t markdata = {objary_begin, objary_end, obj16_begin, {new_obj, obj16_begin, obj16_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array16), - &markdata, sizeof(markdata), 0); - ary16 = (gc_mark_array16_t*)sp.data; - goto array16_loaded; + uint16_t *obj16_begin = (uint16_t *)jl_dt_layout_ptrs(layout); + uint16_t *obj16_end = obj16_begin + npointers; + gc_mark_array16(ptls, objary_parent, objary_begin, objary_end, obj16_begin, + obj16_end, nptr); } else { assert(0 && "unimplemented"); } } - goto pop; } else if (vt == jl_module_type) { if (update_meta) gc_setmark(ptls, o, bits, sizeof(jl_module_t)); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_module_t)); - jl_module_t *m = (jl_module_t*)new_obj; - jl_svec_t *bindings = jl_atomic_load_relaxed(&m->bindings); + jl_module_t *mb_parent = (jl_module_t *)new_obj; + jl_svec_t *bindings = jl_atomic_load_relaxed(&mb_parent->bindings); jl_binding_t **table = (jl_binding_t**)jl_svec_data(bindings); size_t bsize = jl_svec_len(bindings); - uintptr_t nptr = ((bsize + m->usings.len + 1) << 2) | (bits & GC_OLD); - gc_mark_binding_t markdata = {m, table + 1, table + bsize, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(module_binding), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(markdata)); - goto module_binding; + uintptr_t nptr = ((bsize + mb_parent->usings.len + 1) << 2) | (bits & GC_OLD); + jl_binding_t **mb_begin = table + 1; + jl_binding_t **mb_end = table + bsize; + gc_mark_module_binding(ptls, mb_parent, mb_begin, mb_end, nptr, bits); } else if (vt == jl_task_type) { if (update_meta) gc_setmark(ptls, o, bits, sizeof(jl_task_t)); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_task_t)); - jl_task_t *ta = (jl_task_t*)new_obj; + jl_task_t *ta = (jl_task_t *)new_obj; gc_scrub_record_task(ta); if (gc_cblist_task_scanner) { - export_gc_state(ptls, &sp); int16_t tid = jl_atomic_load_relaxed(&ta->tid); - gc_invoke_callbacks(jl_gc_cb_task_scanner_t, - gc_cblist_task_scanner, - (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); - import_gc_state(ptls, &sp); + gc_invoke_callbacks(jl_gc_cb_task_scanner_t, gc_cblist_task_scanner, + (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); } -#ifdef COPY_STACKS + #ifdef COPY_STACKS void *stkbuf = ta->stkbuf; if (stkbuf && ta->copy_stack) { gc_setmark_buf_(ptls, stkbuf, bits, ta->bufsz); @@ -2807,14 +2409,14 @@ mark: { // TODO: edge to stack data // TODO: synthetic node for stack data (how big is it?) } -#endif + #endif jl_gcframe_t *s = ta->gcstack; size_t nroots; uintptr_t offset = 0; uintptr_t lb = 0; uintptr_t ub = (uintptr_t)-1; -#ifdef COPY_STACKS - if (stkbuf && ta->copy_stack && ta->ptls == NULL) { + #ifdef COPY_STACKS + if (stkbuf && ta->copy_stack && !ta->ptls) { int16_t tid = jl_atomic_load_relaxed(&ta->tid); assert(tid >= 0); jl_ptls_t ptls2 = gc_all_tls_states[tid]; @@ -2822,38 +2424,38 @@ mark: { lb = ub - ta->copy_stack; offset = (uintptr_t)stkbuf - lb; } -#endif - if (s) { + #endif + if (s != NULL) { nroots = gc_read_stack(&s->nroots, offset, lb, ub); gc_heap_snapshot_record_task_to_frame_edge(ta, s); - assert(nroots <= UINT32_MAX); - gc_mark_stackframe_t stackdata = {s, 0, (uint32_t)nroots, offset, lb, ub}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(stack), - &stackdata, sizeof(stackdata), 1); + gc_mark_stack(ptls, s, (uint32_t)nroots, offset, lb, ub); } if (ta->excstack) { - gc_heap_snapshot_record_task_to_frame_edge(ta, ta->excstack); - gc_setmark_buf_(ptls, ta->excstack, bits, sizeof(jl_excstack_t) + - sizeof(uintptr_t)*ta->excstack->reserved_size); - gc_mark_excstack_t stackdata = {ta->excstack, ta->excstack->top, 0, 0}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(excstack), - &stackdata, sizeof(stackdata), 1); + jl_excstack_t *excstack = ta->excstack; + gc_heap_snapshot_record_task_to_frame_edge(ta, excstack); + size_t itr = ta->excstack->top; + gc_setmark_buf_(ptls, excstack, bits, + sizeof(jl_excstack_t) + + sizeof(uintptr_t) * excstack->reserved_size); + gc_mark_excstack(ptls, excstack, itr); } const jl_datatype_layout_t *layout = jl_task_type->layout; assert(layout->fielddesc_type == 0); assert(layout->nfields > 0); uint32_t npointers = layout->npointers; - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; + char *obj8_parent = (char *)ta; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; // assume tasks always reference young objects: set lowest bit uintptr_t nptr = (npointers << 2) | 1 | bits; - gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; - obj8_parent = (char*)ta; - goto obj8_loaded; + new_obj = gc_mark_obj8(ptls, obj8_parent, obj8_begin, obj8_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_markqueue_push(mq, new_obj); + } } else if (vt == jl_string_type) { size_t dtsz = jl_string_len(new_obj) + sizeof(size_t) + 1; @@ -2861,145 +2463,216 @@ mark: { gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); - goto pop; } else { if (__unlikely(!jl_is_datatype(vt))) - gc_assert_datatype_fail(ptls, vt, sp); + gc_assert_datatype_fail(ptls, vt, mq); size_t dtsz = jl_datatype_size(vt); if (update_meta) gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); if (vt == jl_weakref_type) - goto pop; + return; const jl_datatype_layout_t *layout = vt->layout; uint32_t npointers = layout->npointers; if (npointers == 0) - goto pop; - uintptr_t nptr = npointers << 2 | (bits & GC_OLD); - assert((layout->nfields > 0 || layout->fielddesc_type == 3) && "opaque types should have been handled specially"); + return; + uintptr_t nptr = (npointers << 2 | (bits & GC_OLD)); + assert((layout->nfields > 0 || layout->fielddesc_type == 3) && + "opaque types should have been handled specially"); if (layout->fielddesc_type == 0) { - obj8_parent = (char*)new_obj; - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; + char *obj8_parent = (char *)new_obj; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; assert(obj8_begin < obj8_end); - gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; - goto obj8_loaded; + new_obj = gc_mark_obj8(ptls, obj8_parent, obj8_begin, obj8_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_markqueue_push(mq, new_obj); + } } else if (layout->fielddesc_type == 1) { - obj16_parent = (char*)new_obj; - obj16_begin = (uint16_t*)jl_dt_layout_ptrs(layout); - obj16_end = obj16_begin + npointers; + char *obj16_parent = (char *)new_obj; + uint16_t *obj16_begin = (uint16_t *)jl_dt_layout_ptrs(layout); + uint16_t *obj16_end = obj16_begin + npointers; assert(obj16_begin < obj16_end); - gc_mark_obj16_t markdata = {new_obj, obj16_begin, obj16_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj16), - &markdata, sizeof(markdata), 0); - obj16 = (gc_mark_obj16_t*)sp.data; - goto obj16_loaded; + new_obj = gc_mark_obj16(ptls, obj16_parent, obj16_begin, obj16_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_markqueue_push(mq, new_obj); + } } else if (layout->fielddesc_type == 2) { // This is very uncommon // Do not do store to load forwarding to save some code size - uint32_t *obj32_begin = (uint32_t*)jl_dt_layout_ptrs(layout); + char *obj32_parent = (char *)new_obj; + uint32_t *obj32_begin = (uint32_t *)jl_dt_layout_ptrs(layout); uint32_t *obj32_end = obj32_begin + npointers; - gc_mark_obj32_t markdata = {new_obj, obj32_begin, obj32_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj32), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(markdata)); - goto obj32; + assert(obj32_begin < obj32_end); + new_obj = gc_mark_obj32(ptls, obj32_parent, obj32_begin, obj32_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_markqueue_push(mq, new_obj); + } } else { assert(layout->fielddesc_type == 3); - jl_fielddescdyn_t *desc = (jl_fielddescdyn_t*)jl_dt_layout_fields(layout); + jl_fielddescdyn_t *desc = (jl_fielddescdyn_t *)jl_dt_layout_fields(layout); int old = jl_astaggedvalue(new_obj)->bits.gc & 2; - export_gc_state(ptls, &sp); uintptr_t young = desc->markfunc(ptls, new_obj); - import_gc_state(ptls, &sp); if (old && young) gc_mark_push_remset(ptls, new_obj, young * 4 + 3); - goto pop; } } } } -static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - jl_ptls_t ptls2) +// Used in gc-debug +void gc_mark_loop_(jl_ptls_t ptls, jl_gc_markqueue_t *mq) +{ + while (1) { + void *new_obj = (void *)gc_markqueue_pop(&ptls->mark_queue); + // No more objects to mark + if (new_obj == NULL) { + // TODO: work-stealing comes here... + return; + } + gc_mark_outrefs(ptls, mq, new_obj, 0); + } +} + +// Drain items from worker's own chunkqueue +void gc_drain_own_chunkqueue(jl_ptls_t ptls, jl_gc_markqueue_t *mq) +{ + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + do { + c = gc_chunkqueue_pop(mq); + if (c.cid != GC_empty_chunk) { + gc_mark_chunk(ptls, mq, &c); + gc_mark_loop_(ptls, mq); + } + } while (c.cid != GC_empty_chunk); +} + +// Main mark loop. Single stack (allocated on the heap) of `jl_value_t *` +// is used to keep track of processed items. Maintaning this stack (instead of +// native one) avoids stack overflow when marking deep objects and +// makes it easier to implement parallel marking via work-stealing +JL_EXTENSION NOINLINE void gc_mark_loop(jl_ptls_t ptls) +{ + gc_mark_loop_(ptls, &ptls->mark_queue); + gc_drain_own_chunkqueue(ptls, &ptls->mark_queue); +} + +static void gc_premark(jl_ptls_t ptls2) +{ + arraylist_t *remset = ptls2->heap.remset; + ptls2->heap.remset = ptls2->heap.last_remset; + ptls2->heap.last_remset = remset; + ptls2->heap.remset->len = 0; + ptls2->heap.remset_nptr = 0; + // avoid counting remembered objects + // in `perm_scanned_bytes` + size_t len = remset->len; + void **items = remset->items; + for (size_t i = 0; i < len; i++) { + jl_value_t *item = (jl_value_t *)items[i]; + objprofile_count(jl_typeof(item), 2, 0); + jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; + } +} + +static void gc_queue_thread_local(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) { jl_task_t *task; task = ptls2->root_task; - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "root task"); } task = jl_atomic_load_relaxed(&ptls2->current_task); - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "current task"); } task = ptls2->next_task; - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "next task"); } task = ptls2->previous_task; - if (task) { // shouldn't be necessary, but no reason not to - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "previous task"); } if (ptls2->previous_exception) { - gc_mark_queue_obj(gc_cache, sp, ptls2->previous_exception); + gc_try_claim_and_push(mq, ptls2->previous_exception, NULL); gc_heap_snapshot_record_root((jl_value_t*)ptls2->previous_exception, "previous exception"); } } +static void gc_queue_bt_buf(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) +{ + jl_bt_element_t *bt_data = ptls2->bt_data; + size_t bt_size = ptls2->bt_size; + for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { + jl_bt_element_t *bt_entry = bt_data + i; + if (jl_bt_is_native(bt_entry)) + continue; + size_t njlvals = jl_bt_num_jlvals(bt_entry); + for (size_t j = 0; j < njlvals; j++) + gc_try_claim_and_push(mq, jl_bt_entry_jlvalue(bt_entry, j), NULL); + } +} + +static void gc_queue_remset(jl_ptls_t ptls, jl_ptls_t ptls2) +{ + size_t len = ptls2->heap.last_remset->len; + void **items = ptls2->heap.last_remset->items; + for (size_t i = 0; i < len; i++) { + // Objects in the `remset` are already marked, + // so a `gc_try_claim_and_push` wouldn't work here + gc_mark_outrefs(ptls, &ptls->mark_queue, (jl_value_t *)items[i], 1); + } +} + extern jl_value_t *cmpswap_names JL_GLOBALLY_ROOTED; extern jl_task_t *wait_empty JL_GLOBALLY_ROOTED; // mark the initial root set -static void mark_roots(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) +static void gc_mark_roots(jl_gc_markqueue_t *mq) { // modules - gc_mark_queue_obj(gc_cache, sp, jl_main_module); + gc_try_claim_and_push(mq, jl_main_module, NULL); gc_heap_snapshot_record_root((jl_value_t*)jl_main_module, "main_module"); - // invisible builtin values - if (jl_an_empty_vec_any != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_an_empty_vec_any); - if (jl_module_init_order != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_module_init_order); + gc_try_claim_and_push(mq, jl_an_empty_vec_any, NULL); + gc_try_claim_and_push(mq, jl_module_init_order, NULL); for (size_t i = 0; i < jl_current_modules.size; i += 2) { if (jl_current_modules.table[i + 1] != HT_NOTFOUND) { - gc_mark_queue_obj(gc_cache, sp, jl_current_modules.table[i]); + gc_try_claim_and_push(mq, jl_current_modules.table[i], NULL); gc_heap_snapshot_record_root((jl_value_t*)jl_current_modules.table[i], "top level module"); } } - gc_mark_queue_obj(gc_cache, sp, jl_anytuple_type_type); + gc_try_claim_and_push(mq, jl_anytuple_type_type, NULL); for (size_t i = 0; i < N_CALL_CACHE; i++) { jl_typemap_entry_t *v = jl_atomic_load_relaxed(&call_cache[i]); - if (v != NULL) { - gc_mark_queue_obj(gc_cache, sp, v); - } - } - if (jl_all_methods != NULL) { - gc_mark_queue_obj(gc_cache, sp, jl_all_methods); + gc_try_claim_and_push(mq, v, NULL); } - if (_jl_debug_method_invalidation != NULL) - gc_mark_queue_obj(gc_cache, sp, _jl_debug_method_invalidation); - if (jl_build_ids != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_build_ids); - + gc_try_claim_and_push(mq, jl_all_methods, NULL); + gc_try_claim_and_push(mq, _jl_debug_method_invalidation, NULL); + gc_try_claim_and_push(mq, jl_build_ids, NULL); // constants - gc_mark_queue_obj(gc_cache, sp, jl_emptytuple_type); - if (cmpswap_names != NULL) - gc_mark_queue_obj(gc_cache, sp, cmpswap_names); - if (wait_empty != NULL) - gc_mark_queue_obj(gc_cache, sp, wait_empty); - gc_mark_queue_obj(gc_cache, sp, jl_global_roots_table); + gc_try_claim_and_push(mq, jl_emptytuple_type, NULL); + gc_try_claim_and_push(mq, cmpswap_names, NULL); + gc_try_claim_and_push(mq, jl_global_roots_table, NULL); } // find unmarked objects that need to be finalized from the finalizer list "list". @@ -3134,47 +2807,6 @@ JL_DLLEXPORT int64_t jl_gc_live_bytes(void) return live_bytes; } -static void jl_gc_premark(jl_ptls_t ptls2) -{ - arraylist_t *remset = ptls2->heap.remset; - ptls2->heap.remset = ptls2->heap.last_remset; - ptls2->heap.last_remset = remset; - ptls2->heap.remset->len = 0; - ptls2->heap.remset_nptr = 0; - - // avoid counting remembered objects & bindings twice - // in `perm_scanned_bytes` - size_t len = remset->len; - void **items = remset->items; - for (size_t i = 0; i < len; i++) { - jl_value_t *item = (jl_value_t*)items[i]; - objprofile_count(jl_typeof(item), 2, 0); - jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; - } -} - -static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) -{ - size_t len = ptls2->heap.last_remset->len; - void **items = ptls2->heap.last_remset->items; - for (size_t i = 0; i < len; i++) - gc_mark_queue_scan_obj(gc_cache, sp, (jl_value_t*)items[i]); -} - -static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) -{ - jl_bt_element_t *bt_data = ptls2->bt_data; - size_t bt_size = ptls2->bt_size; - for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { - jl_bt_element_t *bt_entry = bt_data + i; - if (jl_bt_is_native(bt_entry)) - continue; - size_t njlvals = jl_bt_num_jlvals(bt_entry); - for (size_t j = 0; j < njlvals; j++) - gc_mark_queue_obj(gc_cache, sp, jl_bt_entry_jlvalue(bt_entry, j)); - } -} - size_t jl_maxrss(void); // Only one thread should be running in this function @@ -3182,9 +2814,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) { combine_thread_gc_counts(&gc_num); - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + jl_gc_markqueue_t *mq = &ptls->mark_queue; uint64_t gc_start_time = jl_hrtime(); int64_t last_perm_scanned_bytes = perm_scanned_bytes; @@ -3196,33 +2826,30 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) - jl_gc_premark(ptls2); + gc_premark(ptls2); } assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) - continue; - // 2.1. mark every object in the `last_remsets` - jl_gc_queue_remset(gc_cache, &sp, ptls2); - // 2.2. mark every thread local root - jl_gc_queue_thread_local(gc_cache, &sp, ptls2); - // 2.3. mark any managed objects in the backtrace buffer - // TODO: treat these as roots for gc_heap_snapshot_record - jl_gc_queue_bt_buf(gc_cache, &sp, ptls2); + if (ptls2 != NULL) { + // 2.1. mark every thread local root + gc_queue_thread_local(mq, ptls2); + // 2.2. mark any managed objects in the backtrace buffer + // TODO: treat these as roots for gc_heap_snapshot_record + gc_queue_bt_buf(mq, ptls2); + // 2.3. mark every object in the `last_remsets` and `rem_binding` + gc_queue_remset(ptls, ptls2); + } } // 3. walk roots - mark_roots(gc_cache, &sp); + gc_mark_roots(mq); if (gc_cblist_root_scanner) { - export_gc_state(ptls, &sp); gc_invoke_callbacks(jl_gc_cb_root_scanner_t, gc_cblist_root_scanner, (collection)); - import_gc_state(ptls, &sp); } - gc_mark_loop(ptls, sp); - gc_mark_sp_init(gc_cache, &sp); + gc_mark_loop(ptls); gc_num.since_sweep += gc_num.allocd; JL_PROBE_GC_MARK_END(scanned_bytes, perm_scanned_bytes); gc_settime_premark_end(); @@ -3243,9 +2870,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - sweep_finalizer_list(&ptls2->finalizers); + if (ptls2 != NULL) + sweep_finalizer_list(&ptls2->finalizers); } if (prev_sweep_full) { sweep_finalizer_list(&finalizer_list_marked); @@ -3254,15 +2880,13 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + if (ptls2 != NULL) + gc_mark_finlist(mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, orig_marked_len); + gc_mark_finlist(mq, &finalizer_list_marked, orig_marked_len); // "Flush" the mark stack before flipping the reset_age bit // so that the objects are not incorrectly reset. - gc_mark_loop(ptls, sp); - gc_mark_sp_init(gc_cache, &sp); + gc_mark_loop(ptls); // Conservative marking relies on age to tell allocated objects // and freelist entries apart. mark_reset_age = !jl_gc_conservative_gc_support_enabled(); @@ -3270,8 +2894,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // `to_finalize` list. These objects are only reachable from this list // and should not be referenced by any old objects so this won't break // the GC invariant. - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(mq, &to_finalize, 0); + gc_mark_loop(ptls); mark_reset_age = 0; gc_settime_postmark_end(); @@ -3297,9 +2921,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - nptr += ptls2->heap.remset_nptr; + if (ptls2 != NULL) + nptr += ptls2->heap.remset_nptr; } // many pointers in the intergen frontier => "quick" mark is not quick @@ -3349,7 +2972,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) promoted_bytes = 0; } scanned_bytes = 0; - // 5. start sweeping + // 6. start sweeping uint64_t start_sweep_time = jl_hrtime(); JL_PROBE_GC_SWEEP_BEGIN(sweep_full); sweep_weak_refs(); @@ -3370,7 +2993,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) gc_num.sweep_time = sweep_time; // sweeping is over - // 6. if it is a quick sweep, put back the remembered objects in queued state + // 7. if it is a quick sweep, put back the remembered objects in queued state // so that we don't trigger the barrier again on them. assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { @@ -3400,7 +3023,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } #endif - _report_gc_finished(pause, gc_num.freed, sweep_full, recollect); gc_final_pause_end(gc_start_time, gc_end_time); @@ -3417,17 +3039,18 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) live_bytes += -gc_num.freed + gc_num.since_sweep; if (collection == JL_GC_AUTO) { - // If the current interval is larger than half the live data decrease the interval - int64_t half = live_bytes/2; - if (gc_num.interval > half) gc_num.interval = half; - // But never go below default - if (gc_num.interval < default_collect_interval) gc_num.interval = default_collect_interval; + // If the current interval is larger than half the live data decrease the interval + int64_t half = live_bytes/2; + if (gc_num.interval > half) gc_num.interval = half; + // But never go below default + if (gc_num.interval < default_collect_interval) gc_num.interval = default_collect_interval; } if (gc_num.interval + live_bytes > max_total_memory) { if (live_bytes < max_total_memory) { gc_num.interval = max_total_memory - live_bytes; - } else { + } + else { // We can't stay under our goal so let's go back to // the minimum interval and hope things get better gc_num.interval = default_collect_interval; @@ -3541,16 +3164,15 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) errno = last_errno; } -void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) +void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; assert(gc_n_threads); for (size_t i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2) - jl_gc_queue_thread_local(gc_cache, sp, ptls2); + gc_queue_thread_local(mq, ptls2); } - mark_roots(gc_cache, sp); + gc_mark_roots(mq); } // allocator entry points @@ -3586,10 +3208,16 @@ void jl_init_thread_heap(jl_ptls_t ptls) gc_cache->perm_scanned_bytes = 0; gc_cache->scanned_bytes = 0; gc_cache->nbig_obj = 0; - size_t init_size = 1024; - gc_cache->pc_stack = (void**)malloc_s(init_size * sizeof(void*)); - gc_cache->pc_stack_end = gc_cache->pc_stack + init_size; - gc_cache->data_stack = (jl_gc_mark_data_t *)malloc_s(init_size * sizeof(jl_gc_mark_data_t)); + + // Initialize GC mark-queue + size_t init_size = (1 << 18); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + mq->start = (jl_value_t **)malloc_s(init_size * sizeof(jl_value_t *)); + mq->current = mq->start; + mq->end = mq->start + init_size; + size_t cq_init_size = (1 << 14); + mq->current_chunk = mq->chunk_start = (jl_gc_chunk_t *)malloc_s(cq_init_size * sizeof(jl_gc_chunk_t)); + mq->chunk_end = mq->chunk_start + cq_init_size; memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); @@ -3631,9 +3259,6 @@ void jl_gc_init(void) if (high_water_mark < max_total_memory) max_total_memory = high_water_mark; - - jl_gc_mark_sp_t sp = {NULL, NULL, NULL, NULL}; - gc_mark_loop(NULL, sp); t_start = jl_hrtime(); } @@ -3657,7 +3282,7 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_num.allocd, @@ -3672,7 +3297,7 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_num.allocd, @@ -3688,7 +3313,7 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; free(p); - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; jl_atomic_store_relaxed(&ptls->gc_num.freed, jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); @@ -3701,7 +3326,7 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); if (sz < old) diff --git a/src/gc.h b/src/gc.h index 35767b75ea3f8..930f7f3c30594 100644 --- a/src/gc.h +++ b/src/gc.h @@ -42,7 +42,6 @@ extern "C" { typedef struct { uint64_t num; uint64_t next; - uint64_t min; uint64_t interv; uint64_t max; @@ -83,162 +82,26 @@ typedef struct { uint64_t total_mark_time; } jl_gc_num_t; -enum { - GC_MARK_L_marked_obj, - GC_MARK_L_scan_only, - GC_MARK_L_finlist, - GC_MARK_L_objarray, - GC_MARK_L_array8, - GC_MARK_L_array16, - GC_MARK_L_obj8, - GC_MARK_L_obj16, - GC_MARK_L_obj32, - GC_MARK_L_stack, - GC_MARK_L_excstack, - GC_MARK_L_module_binding, - _GC_MARK_L_MAX -}; - -// The following structs (`gc_mark_*_t`) contain iterator state used for the -// scanning of various object types. -// -// The `nptr` member records the number of pointers slots referenced by -// an object to be used in the full collection heuristics as well as whether the object -// references young objects. -// `nptr >> 2` is the number of pointers fields referenced by the object. -// The lowest bit of `nptr` is set if the object references young object. -// The 2nd lowest bit of `nptr` is the GC old bits of the object after marking. -// A `0x3` in the low bits means that the object needs to be in the remset. - -// An generic object that's marked and needs to be scanned -// The metadata might need update too (depend on the PC) -typedef struct { - jl_value_t *obj; // The object - uintptr_t tag; // The tag with the GC bits masked out - uint8_t bits; // The GC bits after tagging (`bits & 1 == 1`) -} gc_mark_marked_obj_t; - -// An object array. This can come from an array, svec, or the using array or a module -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint32_t step; // Number of pointers to jump between marks - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_objarray_t; - -// A normal object with 8bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint8_t *begin; // Current field descriptor. - uint8_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj8_t; - -// A normal object with 16bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint16_t *begin; // Current field descriptor. - uint16_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj16_t; - -// A normal object with 32bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint32_t *begin; // Current field descriptor. - uint32_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj32_t; - -typedef struct { - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint8_t *rebegin; - gc_mark_obj8_t elem; -} gc_mark_array8_t; - -typedef struct { - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint16_t *rebegin; - gc_mark_obj16_t elem; -} gc_mark_array16_t; - -// Stack frame -typedef struct { - jl_gcframe_t *s; // The current stack frame - uint32_t i; // The current slot index in the frame - uint32_t nroots; // `nroots` fields in the frame - // Parameters to mark the copy_stack range. - uintptr_t offset; - uintptr_t lb; - uintptr_t ub; -} gc_mark_stackframe_t; - -// Exception stack data -typedef struct { - jl_excstack_t *s; // Stack of exceptions - size_t itr; // Iterator into exception stack - size_t bt_index; // Current backtrace buffer entry index - size_t jlval_index; // Index into GC managed values for current bt entry -} gc_mark_excstack_t; - -// Module bindings. This is also the beginning of module scanning. -// The loop will start marking other references in a module after the bindings are marked -typedef struct { - jl_module_t *parent; // The parent module to trigger write barrier on. - jl_binding_t **begin; // The first slot to be scanned. - jl_binding_t **end; // The end address (after the last slot to be scanned) - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_binding_t; - -// Finalizer (or object) list -typedef struct { - jl_value_t **begin; - jl_value_t **end; -} gc_mark_finlist_t; - -// This is used to determine the max size of the data objects on the data stack. -// We'll use this size to determine the size of the data stack corresponding to a -// PC stack size. Since the data objects are not all of the same size, we'll waste -// some memory on the data stack this way but that size is unlikely going to be significant. -union _jl_gc_mark_data { - gc_mark_marked_obj_t marked; - gc_mark_objarray_t objarray; - gc_mark_array8_t array8; - gc_mark_array16_t array16; - gc_mark_obj8_t obj8; - gc_mark_obj16_t obj16; - gc_mark_obj32_t obj32; - gc_mark_stackframe_t stackframe; - gc_mark_excstack_t excstackframe; - gc_mark_binding_t binding; - gc_mark_finlist_t finlist; -}; - -// Pop a data struct from the mark data stack (i.e. decrease the stack pointer) -// This should be used after dispatch and therefore the pc stack pointer is already popped from -// the stack. -STATIC_INLINE void *gc_pop_markdata_(jl_gc_mark_sp_t *sp, size_t size) -{ - jl_gc_mark_data_t *data = (jl_gc_mark_data_t *)(((char*)sp->data) - size); - sp->data = data; - return data; -} -#define gc_pop_markdata(sp, type) ((type*)gc_pop_markdata_(sp, sizeof(type))) - -// Re-push a frame to the mark stack (both data and pc) -// The data and pc are expected to be on the stack (or updated in place) already. -// Mainly useful to pause the current scanning in order to scan an new object. -STATIC_INLINE void *gc_repush_markdata_(jl_gc_mark_sp_t *sp, size_t size) JL_NOTSAFEPOINT -{ - jl_gc_mark_data_t *data = sp->data; - sp->pc++; - sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + size); - return data; -} -#define gc_repush_markdata(sp, type) ((type*)gc_repush_markdata_(sp, sizeof(type))) +typedef enum { + GC_empty_chunk, + GC_objary_chunk, + GC_ary8_chunk, + GC_ary16_chunk, + GC_finlist_chunk, +} gc_chunk_id_t; + +typedef struct _jl_gc_chunk_t { + gc_chunk_id_t cid; + struct _jl_value_t *parent; + struct _jl_value_t **begin; + struct _jl_value_t **end; + void *elem_begin; + void *elem_end; + uint32_t step; + uintptr_t nptr; +} jl_gc_chunk_t; + +#define MAX_REFS_AT_ONCE (1 << 16) // layout for big (>2k) objects @@ -507,23 +370,16 @@ STATIC_INLINE void gc_big_object_link(bigval_t *hdr, bigval_t **list) JL_NOTSAFE *list = hdr; } -STATIC_INLINE void gc_mark_sp_init(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) -{ - sp->pc = gc_cache->pc_stack; - sp->data = gc_cache->data_stack; - sp->pc_start = gc_cache->pc_stack; - sp->pc_end = gc_cache->pc_stack_end; -} - -void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp); -void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - arraylist_t *list, size_t start); -void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp); +void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, + jl_value_t **fl_end) JL_NOTSAFEPOINT; +void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, + size_t start) JL_NOTSAFEPOINT; +void gc_mark_loop_(jl_ptls_t ptls, jl_gc_markqueue_t *mq); +void gc_mark_loop(jl_ptls_t ptls); void sweep_stack_pools(void); void jl_gc_debug_init(void); -extern void *gc_mark_label_addrs[_GC_MARK_L_MAX]; - // GC pages void jl_gc_init_page(void); @@ -608,7 +464,6 @@ static inline void gc_verify_tags(void) } #endif - #ifdef GC_VERIFY extern jl_value_t *lostval; void gc_verify(jl_ptls_t ptls); @@ -649,10 +504,9 @@ extern int gc_verifying; #define gc_verifying (0) #endif - int gc_slot_to_fieldidx(void *_obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT; int gc_slot_to_arrayidx(void *_obj, void *begin) JL_NOTSAFEPOINT; -NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset); +NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_markqueue_t *mq, int offset) JL_NOTSAFEPOINT; #ifdef GC_DEBUG_ENV JL_DLLEXPORT extern jl_gc_debug_env_t jl_gc_debug_env; diff --git a/src/julia_internal.h b/src/julia_internal.h index 7565967b0a270..8f7b49239c5e3 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -576,8 +576,8 @@ STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOT } } -void jl_gc_debug_print_status(void); -JL_DLLEXPORT void jl_gc_debug_critical_error(void); +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT; void jl_print_gc_stats(JL_STREAM *s); void jl_gc_reset_alloc_count(void); uint32_t jl_get_gs_ctr(void); @@ -1190,7 +1190,7 @@ size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_cont #endif JL_DLLEXPORT jl_value_t *jl_get_backtrace(void); void jl_critical_error(int sig, int si_code, bt_context_t *context, jl_task_t *ct); -JL_DLLEXPORT void jl_raise_debugger(void); +JL_DLLEXPORT void jl_raise_debugger(void) JL_NOTSAFEPOINT; int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT; void jl_print_native_codeloc(uintptr_t ip) JL_NOTSAFEPOINT; diff --git a/src/julia_threads.h b/src/julia_threads.h index 5874225c12eac..719de95c9e375 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -170,16 +170,14 @@ typedef struct { arraylist_t free_stacks[JL_N_STACK_POOLS]; } jl_thread_heap_t; -// Cache of thread local change to global metadata during GC -// This is sync'd after marking. -typedef union _jl_gc_mark_data jl_gc_mark_data_t; - typedef struct { - void **pc; // Current stack address for the pc (up growing) - jl_gc_mark_data_t *data; // Current stack address for the data (up growing) - void **pc_start; // Cached value of `gc_cache->pc_stack` - void **pc_end; // Cached value of `gc_cache->pc_stack_end` -} jl_gc_mark_sp_t; + struct _jl_gc_chunk_t *chunk_start; + struct _jl_gc_chunk_t *current_chunk; + struct _jl_gc_chunk_t *chunk_end; + struct _jl_value_t **start; + struct _jl_value_t **current; + struct _jl_value_t **end; +} jl_gc_markqueue_t; typedef struct { // thread local increment of `perm_scanned_bytes` @@ -197,9 +195,6 @@ typedef struct { // this makes sure that a single objects can only appear once in // the lists (the mark bit cannot be flipped to `0` without sweeping) void *big_obj[1024]; - void **pc_stack; - void **pc_stack_end; - jl_gc_mark_data_t *data_stack; } jl_gc_mark_cache_t; struct _jl_bt_element_t; @@ -265,9 +260,9 @@ typedef struct _jl_tls_states_t { #endif jl_thread_t system_id; arraylist_t finalizers; + jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; arraylist_t sweep_objs; - jl_gc_mark_sp_t gc_mark_sp; // Saved exception for previous *external* API call or NULL if cleared. // Access via jl_exception_occurred(). struct _jl_value_t *previous_exception; diff --git a/src/partr.c b/src/partr.c index 3e71cb6627e07..4faff409c711a 100644 --- a/src/partr.c +++ b/src/partr.c @@ -78,7 +78,7 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, - jl_gc_mark_sp_t *sp, jl_value_t *obj) JL_NOTSAFEPOINT; + jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; // parallel task runtime // --- diff --git a/src/support/dtypes.h b/src/support/dtypes.h index d49ae0b22b5f9..891c091413084 100644 --- a/src/support/dtypes.h +++ b/src/support/dtypes.h @@ -117,6 +117,7 @@ typedef intptr_t ssize_t; #define LLT_FREE(x) free(x) #define STATIC_INLINE static inline +#define FORCE_INLINE static inline __attribute__((always_inline)) #if defined(_OS_WINDOWS_) && !defined(_COMPILER_GCC_) # define NOINLINE __declspec(noinline)