Skip to content

Commit

Permalink
Fix deadlock in windows profiler
Browse files Browse the repository at this point in the history
The deadlock here happens when the main thread is trying to register
a new JIT object while the profiler is triggered. The main thread
will acquire the objectmap lock and then get suspended. Then the profiler
thread will attempt to stackwalk, but in order to do that effectively,
it needs to look at the object map to find the unwind info for the function
on the stack. We don't have the same problem on linux because the main
thread runs the backtracing. On linux and mac the registration process
works differently and on linux at least the backtracing happens on a
different thread so we may not have this problem. However, #13294 says
that it is also a problem on OS X. We should keep an eye out for it.
For the moment, just try to fix this by terminating the stack unwind
when we fail to acquire the lock. That's not ideal, because it reduces
the quality of the profiler info, but only in situations where we would
previous deadlock. This entire code needs a rewrite, but for now, I'm
just hoping to get CI to stopping deadlocking on us.
  • Loading branch information
Keno committed Jan 21, 2020
1 parent d759b5b commit 21dfe6c
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 29 deletions.
27 changes: 27 additions & 0 deletions src/debuginfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,14 @@ class JuliaJITEventListener: public JITEventListener
uv_rwlock_rdlock(&threadsafe);
return objectmap;
}

Optional<std::map<size_t, ObjectInfo, revcomp>*> trygetObjectMap()
{
if (0 != uv_rwlock_tryrdlock(&threadsafe)) {
return &objectmap;
}
return {};
}
};

JL_DLLEXPORT void ORCNotifyObjectEmitted(JITEventListener *Listener,
Expand Down Expand Up @@ -1662,3 +1670,22 @@ uint64_t jl_getUnwindInfo(uint64_t dwAddr)
uv_rwlock_rdunlock(&threadsafe);
return ipstart;
}

extern "C"
uint64_t jl_trygetUnwindInfo(uint64_t dwAddr)
{
// Might be called from unmanaged thread
Optional<std::map<size_t, ObjectInfo, revcomp>*> maybeobjmap = jl_jit_events->trygetObjectMap();
if (maybeobjmap) {
std::map<size_t, ObjectInfo, revcomp> &objmap = **maybeobjmap;
std::map<size_t, ObjectInfo, revcomp>::iterator it = objmap.lower_bound(dwAddr);
uint64_t ipstart = 0; // ip of the start of the section (if found)
if (it != objmap.end() && dwAddr < it->first + it->second.SectionSize) {
ipstart = (uint64_t)(uintptr_t)(*it).first;
}
uv_rwlock_rdunlock(&threadsafe);
return ipstart;
}
return 0;
}

3 changes: 2 additions & 1 deletion src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,7 @@ typedef struct {

// Might be called from unmanaged thread
uint64_t jl_getUnwindInfo(uint64_t dwBase);
uint64_t jl_trygetUnwindInfo(uint64_t dwBase);
#ifdef _OS_WINDOWS_
#include <dbghelp.h>
JL_DLLEXPORT EXCEPTION_DISPOSITION __julia_personality(
Expand Down Expand Up @@ -757,7 +758,7 @@ size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTS
// Record backtrace from a signal handler. `ctx` is the context of the code
// which was asynchronously interrupted.
size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t *ctx,
jl_gcframe_t *pgcstack) JL_NOTSAFEPOINT;
jl_gcframe_t *pgcstack, int lockless) JL_NOTSAFEPOINT;
#ifdef LIBOSXUNWIND
size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t *ctx, jl_gcframe_t *pgcstack) JL_NOTSAFEPOINT;
#endif
Expand Down
2 changes: 1 addition & 1 deletion src/signal-handling.c
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data,
if (context) {
// Must avoid extended backtrace frames here unless we're sure bt_data
// is properly rooted.
*bt_size = n = rec_backtrace_ctx(bt_data, JL_MAX_BT_SIZE, context, NULL);
*bt_size = n = rec_backtrace_ctx(bt_data, JL_MAX_BT_SIZE, context, NULL, 1);
}
for (i = 0; i < n; i += jl_bt_entry_size(bt_data + i)) {
jl_print_bt_entry_codeloc(bt_data + i);
Expand Down
4 changes: 2 additions & 2 deletions src/signals-mach.c
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ static void jl_throw_in_thread(int tid, mach_port_t thread, jl_value_t *exceptio
if (!ptls2->safe_restore) {
assert(exception);
ptls2->bt_size = rec_backtrace_ctx(ptls2->bt_data, JL_MAX_BT_SIZE,
(bt_context_t*)&state, ptls2->pgcstack);
(bt_context_t*)&state, ptls2->pgcstack, 0);
ptls2->sig_exception = exception;
}
jl_call_in_state(ptls2, &state, &jl_sig_throw);
Expand Down Expand Up @@ -448,7 +448,7 @@ void *mach_profile_listener(void *arg)

if (forceDwarf == 0) {
// Save the backtrace
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL, 1);
}
else if (forceDwarf == 1) {
bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
Expand Down
6 changes: 3 additions & 3 deletions src/signals-unix.c
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ static void jl_throw_in_ctx(jl_ptls_t ptls, jl_value_t *e, int sig, void *sigctx
{
if (!ptls->safe_restore)
ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE,
jl_to_bt_context(sigctx), ptls->pgcstack);
jl_to_bt_context(sigctx), ptls->pgcstack, 0);
ptls->sig_exception = e;
jl_call_in_ctx(ptls, &jl_sig_throw, sig, sigctx);
}
Expand Down Expand Up @@ -670,7 +670,7 @@ static void *signal_listener(void *arg)
if (critical) {
bt_size += rec_backtrace_ctx(bt_data + bt_size,
JL_MAX_BT_SIZE / jl_n_threads - 1,
signal_context, NULL);
signal_context, NULL, 1);
bt_data[bt_size++].uintptr = 0;
}

Expand All @@ -689,7 +689,7 @@ static void *signal_listener(void *arg)
} else {
// Get backtrace data
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
bt_size_max - bt_size_cur - 1, signal_context, NULL);
bt_size_max - bt_size_cur - 1, signal_context, NULL, 1);
}
ptls->safe_restore = old_buf;

Expand Down
6 changes: 3 additions & 3 deletions src/signals-win.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ static void JL_NORETURN start_backtrace_fiber(void)
{
jl_ptls_t ptls = jl_get_ptls_states();
// collect the backtrace
ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, error_ctx, ptls->pgcstack);
ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, error_ctx, ptls->pgcstack, 0);
// switch back to the execution fiber
jl_setcontext(&error_return_fiber);
abort();
Expand All @@ -130,7 +130,7 @@ void jl_throw_in_ctx(jl_value_t *excpt, PCONTEXT ctxThread)
assert(excpt != NULL);
ptls->bt_size = 0;
if (excpt != jl_stackovf_exception) {
ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, ctxThread, ptls->pgcstack);
ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, ctxThread, ptls->pgcstack, 0);
}
else if (have_backtrace_fiber) {
error_ctx = ctxThread;
Expand Down Expand Up @@ -345,7 +345,7 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
}
// Get backtrace data
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
bt_size_max - bt_size_cur - 1, &ctxThread, NULL);
bt_size_max - bt_size_cur - 1, &ctxThread, NULL, 1);
// Mark the end of this block with 0
bt_data_prof[bt_size_cur].uintptr = 0;
bt_size_cur++;
Expand Down
60 changes: 41 additions & 19 deletions src/stackwalk.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ void jl_unw_get(void *context) {};
extern "C" {
#endif

static int jl_unw_init(bt_cursor_t *cursor, bt_context_t *context) JL_NOTSAFEPOINT;
static int jl_unw_step(bt_cursor_t *cursor, uintptr_t *ip, uintptr_t *sp) JL_NOTSAFEPOINT;
static int jl_unw_init(bt_cursor_t *cursor, bt_context_t *context, int lockless) JL_NOTSAFEPOINT;
static int jl_unw_step(bt_cursor_t *cursor, uintptr_t *ip, uintptr_t *sp, int lockless) JL_NOTSAFEPOINT;

static jl_gcframe_t *is_enter_interpreter_frame(jl_gcframe_t **ppgcstack, uintptr_t sp) JL_NOTSAFEPOINT
{
Expand Down Expand Up @@ -67,7 +67,7 @@ static jl_gcframe_t *is_enter_interpreter_frame(jl_gcframe_t **ppgcstack, uintpt
// elements written to bt_data (and sp if non-NULL) are returned in bt_size.
int jl_unw_stepn(bt_cursor_t *cursor, jl_bt_element_t *bt_data, size_t *bt_size,
uintptr_t *sp, size_t maxsize, int skip, jl_gcframe_t **ppgcstack,
int from_signal_handler) JL_NOTSAFEPOINT
int from_signal_handler, int lockless) JL_NOTSAFEPOINT
{
volatile size_t n = 0;
volatile int need_more_space = 0;
Expand Down Expand Up @@ -96,7 +96,7 @@ int jl_unw_stepn(bt_cursor_t *cursor, jl_bt_element_t *bt_data, size_t *bt_size,
need_more_space = 1;
break;
}
have_more_frames = jl_unw_step(cursor, &return_ip, &thesp);
have_more_frames = jl_unw_step(cursor, &return_ip, &thesp, lockless);
if (skip > 0) {
skip--;
continue;
Expand Down Expand Up @@ -179,13 +179,13 @@ int jl_unw_stepn(bt_cursor_t *cursor, jl_bt_element_t *bt_data, size_t *bt_size,
}

NOINLINE size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize,
bt_context_t *context, jl_gcframe_t *pgcstack) JL_NOTSAFEPOINT
bt_context_t *context, jl_gcframe_t *pgcstack, int lockless) JL_NOTSAFEPOINT
{
bt_cursor_t cursor;
if (!jl_unw_init(&cursor, context))
if (!jl_unw_init(&cursor, context, lockless))
return 0;
size_t bt_size = 0;
jl_unw_stepn(&cursor, bt_data, &bt_size, NULL, maxsize, 0, &pgcstack, 1);
jl_unw_stepn(&cursor, bt_data, &bt_size, NULL, maxsize, 0, &pgcstack, 1, lockless);
return bt_size;
}

Expand All @@ -201,10 +201,10 @@ NOINLINE size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip
jl_unw_get(&context);
jl_gcframe_t *pgcstack = jl_pgcstack;
bt_cursor_t cursor;
if (!jl_unw_init(&cursor, &context))
if (!jl_unw_init(&cursor, &context, 0))
return 0;
size_t bt_size = 0;
jl_unw_stepn(&cursor, bt_data, &bt_size, NULL, maxsize, skip + 1, &pgcstack, 0);
jl_unw_stepn(&cursor, bt_data, &bt_size, NULL, maxsize, skip + 1, &pgcstack, 0, 0);
return bt_size;
}

Expand Down Expand Up @@ -235,7 +235,7 @@ JL_DLLEXPORT jl_value_t *jl_backtrace_from_here(int returnsp, int skip)
memset(&context, 0, sizeof(context));
jl_unw_get(&context);
jl_gcframe_t *pgcstack = jl_pgcstack;
if (jl_unw_init(&cursor, &context)) {
if (jl_unw_init(&cursor, &context, 0)) {
// Skip frame for jl_backtrace_from_here itself
skip += 1;
size_t offset = 0;
Expand All @@ -249,7 +249,7 @@ JL_DLLEXPORT jl_value_t *jl_backtrace_from_here(int returnsp, int skip)
}
size_t size_incr = 0;
have_more_frames = jl_unw_stepn(&cursor, (jl_bt_element_t*)jl_array_data(ip) + offset,
&size_incr, sp_ptr, maxincr, skip, &pgcstack, 0);
&size_incr, sp_ptr, maxincr, skip, &pgcstack, 0, 0);
skip = 0;
offset += size_incr;
}
Expand Down Expand Up @@ -415,6 +415,25 @@ static DWORD64 WINAPI JuliaGetModuleBase64(
#endif
}

static DWORD64 WINAPI JuliaAsyncGetModuleBase64(
_In_ HANDLE hProcess,
_In_ DWORD64 dwAddr)
{
//jl_printf(JL_STDOUT, "lookup base %d\n", dwAddr);
#ifdef _CPU_X86_64_
return JuliaGetModuleBase64(hProcess, dwAddr);
#else
if (dwAddr == HistoryTable.dwAddr) return HistoryTable.ImageBase;
DWORD64 ImageBase = jl_trygetUnwindInfo(dwAddr);
if (ImageBase) {
HistoryTable.dwAddr = dwAddr;
HistoryTable.ImageBase = ImageBase;
return ImageBase;
}
return SymGetModuleBase64(hProcess, dwAddr);
#endif
}

// Might be called from unmanaged thread.
int needsSymRefreshModuleList;
BOOL (WINAPI *hSymRefreshModuleList)(HANDLE);
Expand All @@ -427,7 +446,7 @@ void jl_refresh_dbg_module_list(void)
needsSymRefreshModuleList = 0;
}
}
static int jl_unw_init(bt_cursor_t *cursor, bt_context_t *Context)
static int jl_unw_init(bt_cursor_t *cursor, bt_context_t *Context, int lockless)
{
jl_refresh_dbg_module_list();
#if !defined(_CPU_X86_64_)
Expand All @@ -444,7 +463,8 @@ static int jl_unw_init(bt_cursor_t *cursor, bt_context_t *Context)
cursor->stackframe.AddrFrame.Mode = AddrModeFlat;
cursor->context = *Context;
BOOL result = StackWalk64(IMAGE_FILE_MACHINE_I386, GetCurrentProcess(), hMainThread,
&cursor->stackframe, &cursor->context, NULL, JuliaFunctionTableAccess64, JuliaGetModuleBase64, NULL);
&cursor->stackframe, &cursor->context, NULL, JuliaFunctionTableAccess64,
lockless ? JuliaAsyncGetModuleBase64 : JuliaGetModuleBase64, NULL);
jl_in_stackwalk = 0;
return result;
#else
Expand All @@ -467,8 +487,10 @@ static int readable_pointer(LPCVOID pointer)
return 1;
}

static int jl_unw_step(bt_cursor_t *cursor, uintptr_t *ip, uintptr_t *sp)
static int jl_unw_step(bt_cursor_t *cursor, uintptr_t *ip, uintptr_t *sp, int lockless)
{
DWORD64 WINAPI (*GetModuleBase64)(HANDLE, DWORD64) = lockless ?
JuliaAsyncGetModuleBase64 : JuliaGetModuleBase64;
// Might be called from unmanaged thread.
#ifndef _CPU_X86_64_
*ip = (uintptr_t)cursor->stackframe.AddrPC.Offset;
Expand All @@ -482,7 +504,7 @@ static int jl_unw_step(bt_cursor_t *cursor, uintptr_t *ip, uintptr_t *sp)
}

BOOL result = StackWalk64(IMAGE_FILE_MACHINE_I386, GetCurrentProcess(), hMainThread,
&cursor->stackframe, &cursor->context, NULL, JuliaFunctionTableAccess64, JuliaGetModuleBase64, NULL);
&cursor->stackframe, &cursor->context, NULL, JuliaFunctionTableAccess64, GetModuleBase64, NULL);
return result;
#else
*ip = (uintptr_t)cursor->Rip;
Expand All @@ -495,7 +517,7 @@ static int jl_unw_step(bt_cursor_t *cursor, uintptr_t *ip, uintptr_t *sp)
return cursor->Rip != 0;
}

DWORD64 ImageBase = JuliaGetModuleBase64(GetCurrentProcess(), cursor->Rip);
DWORD64 ImageBase = GetModuleBase64(GetCurrentProcess(), cursor->Rip);
if (!ImageBase)
return 0;

Expand Down Expand Up @@ -535,7 +557,7 @@ static int jl_unw_init(bt_cursor_t *cursor, bt_context_t *context)
return unw_init_local(cursor, context) == 0;
}

static int jl_unw_step(bt_cursor_t *cursor, uintptr_t *ip, uintptr_t *sp)
static int jl_unw_step(bt_cursor_t *cursor, uintptr_t *ip, uintptr_t *sp, int lockless)
{
unw_word_t reg;
if (unw_get_reg(cursor, UNW_REG_IP, &reg) < 0)
Expand All @@ -555,7 +577,7 @@ NOINLINE size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize
bt_cursor_t cursor;
if (unw_init_local_dwarf(&cursor, context) != UNW_ESUCCESS)
return 0;
jl_unw_stepn(&cursor, bt_data, &bt_size, NULL, maxsize, 0, &pgcstack, 1);
jl_unw_stepn(&cursor, bt_data, &bt_size, NULL, maxsize, 0, &pgcstack, 1, 0);
return bt_size;
}
#endif
Expand All @@ -567,7 +589,7 @@ static int jl_unw_init(bt_cursor_t *cursor, bt_context_t *context)
return 0;
}

static int jl_unw_step(bt_cursor_t *cursor, uintptr_t *ip, uintptr_t *sp)
static int jl_unw_step(bt_cursor_t *cursor, uintptr_t *ip, uintptr_t *sp, int lockless)
{
return 0;
}
Expand Down

0 comments on commit 21dfe6c

Please sign in to comment.