From 57017f6de66d88436bd80862c23a1ed0bff69648 Mon Sep 17 00:00:00 2001 From: Julien Portalier Date: Sat, 19 Oct 2024 21:09:29 +0200 Subject: [PATCH] Fix Deadlock with parallel stop-world/fork calls in MT (#15096) Trying to stop the world from a thread while threads are forking leads to a deadlock situation in glibc (at least) because we disable the reception of *all* signals while we fork. Since the suspend and resume signals are handled directly and not processed through the event loop (for obvious reasons) we can safely keep these signals enabled. Apparently it's even safer. --- src/crystal/system/unix/process.cr | 7 +++++++ src/crystal/system/unix/pthread.cr | 19 ++++++++++++++++++- src/gc/boehm.cr | 14 ++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/src/crystal/system/unix/process.cr b/src/crystal/system/unix/process.cr index 420030f8ba53..0eb58231900e 100644 --- a/src/crystal/system/unix/process.cr +++ b/src/crystal/system/unix/process.cr @@ -176,7 +176,14 @@ struct Crystal::System::Process newmask = uninitialized LibC::SigsetT oldmask = uninitialized LibC::SigsetT + # block signals while we fork, so the child process won't forward signals it + # may receive to the parent through the signal pipe, but make sure to not + # block stop-the-world signals as it appears to create deadlocks in glibc + # for example; this is safe because these signal handlers musn't be + # registered through `Signal.trap` but directly through `sigaction`. LibC.sigfillset(pointerof(newmask)) + LibC.sigdelset(pointerof(newmask), System::Thread.sig_suspend) + LibC.sigdelset(pointerof(newmask), System::Thread.sig_resume) ret = LibC.pthread_sigmask(LibC::SIG_SETMASK, pointerof(newmask), pointerof(oldmask)) raise RuntimeError.from_errno("Failed to disable signals") unless ret == 0 diff --git a/src/crystal/system/unix/pthread.cr b/src/crystal/system/unix/pthread.cr index 50a0fc56e818..bbdfcbc3d41c 100644 --- a/src/crystal/system/unix/pthread.cr +++ b/src/crystal/system/unix/pthread.cr @@ -249,7 +249,8 @@ module Crystal::System::Thread end end - # the suspend/resume signals follow BDWGC + # the suspend/resume signals try to follow BDWGC but aren't exact (e.g. it may + # use SIGUSR1 and SIGUSR2 on FreeBSD instead of SIGRT). private SIG_SUSPEND = {% if flag?(:linux) %} @@ -266,6 +267,22 @@ module Crystal::System::Thread {% else %} LibC::SIGXCPU {% end %} + + def self.sig_suspend : ::Signal + if GC.responds_to?(:sig_suspend) + GC.sig_suspend + else + ::Signal.new(SIG_SUSPEND) + end + end + + def self.sig_resume : ::Signal + if GC.responds_to?(:sig_resume) + GC.sig_resume + else + ::Signal.new(SIG_RESUME) + end + end end # In musl (alpine) the calls to unwind API segfaults diff --git a/src/gc/boehm.cr b/src/gc/boehm.cr index 41c0f43f2a8c..33d6466d792b 100644 --- a/src/gc/boehm.cr +++ b/src/gc/boehm.cr @@ -164,6 +164,8 @@ lib LibGC fun stop_world_external = GC_stop_world_external fun start_world_external = GC_start_world_external + fun get_suspend_signal = GC_get_suspend_signal : Int + fun get_thr_restart_signal = GC_get_thr_restart_signal : Int end module GC @@ -483,4 +485,16 @@ module GC def self.start_world : Nil LibGC.start_world_external end + + {% if flag?(:unix) %} + # :nodoc: + def self.sig_suspend : Signal + Signal.new(LibGC.get_suspend_signal) + end + + # :nodoc: + def self.sig_resume : Signal + Signal.new(LibGC.get_thr_restart_signal) + end + {% end %} end