Skip to content

Commit

Permalink
rearrange jl_delete_thread to be thread-safe (#56097)
Browse files Browse the repository at this point in the history
Prior to this, especially on macOS, the gc-safepoint here would cause
the process to segfault as we had already freed the current_task state.
Rearrange this code so that the GC interactions (except for the atomic
store to current_task) are all handled before entering GC safe, and then
signaling the thread is deleted (via setting current_task = NULL,
published by jl_unlock_profile_wr to other threads) is last.

```
ERROR: Exception handler triggered on unmanaged thread.
Process 53827 stopped
* thread #5, stop reason = EXC_BAD_ACCESS (code=2, address=0x100018008)
    frame #0: 0x0000000100b74344 libjulia-internal.1.12.0.dylib`jl_delete_thread [inlined] jl_gc_state_set(ptls=0x000000011f8b3200, state='\x02', old_state=<unavailable>) at julia_threads.h:272:9 [opt]
   269 	    assert(old_state != JL_GC_CONCURRENT_COLLECTOR_THREAD);
   270 	    jl_atomic_store_release(&ptls->gc_state, state);
   271 	    if (state == JL_GC_STATE_UNSAFE || old_state == JL_GC_STATE_UNSAFE)
-> 272 	        jl_gc_safepoint_(ptls);
   273 	    return old_state;
   274 	}
   275 	STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls,
Target 0: (julia) stopped.
(lldb) up
frame #1: 0x0000000100b74320 libjulia-internal.1.12.0.dylib`jl_delete_thread [inlined] jl_gc_state_save_and_set(ptls=0x000000011f8b3200, state='\x02') at julia_threads.h:278:12 [opt]
   275 	STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls,
   276 	                                              int8_t state)
   277 	{
-> 278 	    return jl_gc_state_set(ptls, state, jl_atomic_load_relaxed(&ptls->gc_state));
   279 	}
   280 	#ifdef __clang_gcanalyzer__
   281 	// these might not be a safepoint (if they are no-op safe=>safe transitions), but we have to assume it could be (statically)
(lldb)
frame #2: 0x0000000100b7431c libjulia-internal.1.12.0.dylib`jl_delete_thread(value=0x000000011f8b3200) at threading.c:537:11 [opt]
   534 	    ptls->root_task = NULL;
   535 	    jl_free_thread_gc_state(ptls);
   536 	    // then park in safe-region
-> 537 	    (void)jl_gc_safe_enter(ptls);
   538 	}
```

(test incorporated into #55793)

(cherry picked from commit 0d09f3d,
resolving conflicts from not having backported #52198)
  • Loading branch information
vtjnash committed Jan 2, 2025
1 parent 6e28217 commit 9e39755
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 21 deletions.
6 changes: 3 additions & 3 deletions src/gc-stacks.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT
}


static void free_stack(void *stkbuf, size_t bufsz)
static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT
{
VirtualFree(stkbuf, 0, MEM_RELEASE);
jl_atomic_fetch_add(&num_stack_mappings, -1);
Expand All @@ -68,7 +68,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT
return stk;
}

static void free_stack(void *stkbuf, size_t bufsz)
static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT
{
munmap(stkbuf, bufsz);
jl_atomic_fetch_add(&num_stack_mappings, -1);
Expand Down Expand Up @@ -110,7 +110,7 @@ static unsigned select_pool(size_t nb) JL_NOTSAFEPOINT
}


static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz)
static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT
{
#ifdef _COMPILER_ASAN_ENABLED_
__asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz);
Expand Down
2 changes: 1 addition & 1 deletion src/julia.h
Original file line number Diff line number Diff line change
Expand Up @@ -1043,7 +1043,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void);
JL_DLLEXPORT jl_value_t *jl_gc_alloc_3w(void);
JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz);
JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, struct _jl_task_t *owner) JL_NOTSAFEPOINT;
JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz);
JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT;
JL_DLLEXPORT void jl_gc_use(jl_value_t *a);
// Set GC memory trigger in bytes for greedy memory collecting
JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem);
Expand Down
41 changes: 24 additions & 17 deletions src/threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,29 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
// prior unsafe-region (before we let it release the stack memory)
(void)jl_gc_unsafe_enter(ptls);
scheduler_delete_thread(ptls);
// need to clear pgcstack and eh, but we can clear everything now too
jl_task_t *ct = jl_atomic_load_relaxed(&ptls->current_task);
jl_task_frame_noreturn(ct);
if (jl_set_task_tid(ptls->root_task, ptls->tid)) {
// the system will probably free this stack memory soon
// so prevent any other thread from accessing it later
if (ct != ptls->root_task)
jl_task_frame_noreturn(ptls->root_task);
}
else {
// Uh oh. The user cleared the sticky bit so it started running
// elsewhere, then called pthread_exit on this thread from another
// Task, which will free the stack memory of that root task soon. This
// is not recoverable. Though we could just hang here, a fatal message
// is likely better.
jl_safe_printf("fatal: thread exited from wrong Task.\n");
abort();
}
ptls->previous_exception = NULL;
// allow the page root_task is on to be freed
ptls->root_task = NULL;
// park in safe-region from here on (this may run GC again)
(void)jl_gc_safe_enter(ptls);
// try to free some state we do not need anymore
#ifndef _OS_WINDOWS_
void *signal_stack = ptls->signal_stack;
Expand Down Expand Up @@ -481,21 +504,7 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
#else
pthread_mutex_lock(&in_signal_lock);
#endif
// need to clear pgcstack and eh, but we can clear everything now too
jl_task_frame_noreturn(jl_atomic_load_relaxed(&ptls->current_task));
if (jl_set_task_tid(ptls->root_task, ptls->tid)) {
// the system will probably free this stack memory soon
// so prevent any other thread from accessing it later
jl_task_frame_noreturn(ptls->root_task);
}
else {
// Uh oh. The user cleared the sticky bit so it started running
// elsewhere, then called pthread_exit on this thread. This is not
// recoverable. Though we could just hang here, a fatal message is better.
jl_safe_printf("fatal: thread exited from wrong Task.\n");
abort();
}
jl_atomic_store_relaxed(&ptls->current_task, NULL); // dead
jl_atomic_store_relaxed(&ptls->current_task, NULL); // indicate dead
// finally, release all of the locks we had grabbed
#ifdef _OS_WINDOWS_
jl_unlock_profile_wr();
Expand All @@ -506,8 +515,6 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
#else
pthread_mutex_unlock(&in_signal_lock);
#endif
// then park in safe-region
(void)jl_gc_safe_enter(ptls);
}

//// debugging hack: if we are exiting too fast for error message printing on threads,
Expand Down

0 comments on commit 9e39755

Please sign in to comment.