-
Notifications
You must be signed in to change notification settings - Fork 234
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Segfault during trampoline allocation when querying occupancy from multiple threads #707
Comments
What platform and Julia version is this? |
julia> versioninfo()
Julia Version 1.6.0-rc1
Commit a58bdd9010 (2021-02-06 15:49 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: AMD Ryzen 9 3900X 12-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-11.0.1 (ORCJIT, znver2)
Environment:
JULIA_DEPOT_PATH = /julia_depot:$JULIA_DEPOT_PATH
JULIA_PATH = /usr/local/julia
JULIA_NUM_THREADS = 24
JULIA_PKG_SERVER = https://mirrors.sjtug.sjtu.edu.cn/julia
|
This looks like JuliaLang/julia#38709; but we already have a lock around `@cfunction`. |
Could you try with the following patch to Julia: diff --git a/src/runtime_ccall.cpp b/src/runtime_ccall.cpp
index 0dd727749f..e6ba5bce04 100644
--- a/src/runtime_ccall.cpp
+++ b/src/runtime_ccall.cpp
@@ -268,9 +268,10 @@ static void trampoline_deleter(void **f)
free(nval);
}
+static jl_mutex_t trampoline_lock;
+
// Use of `cache` is not clobbered in JL_TRY
JL_GCC_IGNORE_START("-Wclobbered")
-// TODO: need a thread lock around the cache access parts of this function
extern "C" JL_DLLEXPORT
jl_value_t *jl_get_cfunction_trampoline(
// dynamic inputs:
@@ -284,6 +285,7 @@ jl_value_t *jl_get_cfunction_trampoline(
jl_value_t **vals)
{
// lookup (fobj, vals) in cache
+ JL_LOCK(&trampoline_lock);
if (!cache->table)
htable_new(cache, 1);
if (fill != jl_emptysvec) {
@@ -295,6 +297,7 @@ jl_value_t *jl_get_cfunction_trampoline(
}
}
void *tramp = ptrhash_get(cache, (void*)fobj);
+ JL_UNLOCK(&trampoline_lock);
if (tramp != HT_NOTFOUND) {
assert((jl_datatype_t*)jl_typeof(tramp) == result_type);
return (jl_value_t*)tramp;
@@ -347,10 +350,12 @@ jl_value_t *jl_get_cfunction_trampoline(
free(nval);
jl_rethrow();
}
+ JL_LOCK(&trampoline_lock);
tramp = trampoline_alloc();
((void**)result)[0] = tramp;
tramp = init_trampoline(tramp, nval);
ptrhash_put(cache, (void*)fobj, result);
+ JL_UNLOCK(&trampoline_lock);
return result;
}
JL_GCC_IGNORE_STOP
And the following for CUDA.jl: diff --git a/lib/cudadrv/occupancy.jl b/lib/cudadrv/occupancy.jl
index 64097df8..731c4392 100644
--- a/lib/cudadrv/occupancy.jl
+++ b/lib/cudadrv/occupancy.jl
@@ -60,12 +60,10 @@ function launch_configuration(fun::CuFunction; shmem::Union{Integer,Base.Callabl
elseif Sys.ARCH == :x86 || Sys.ARCH == :x86_64
shmem_cint = threads -> Cint(shmem(threads))
# `@cfunction` needs a lock currently, https://github.com/JuliaLang/julia/issues/38709
- cb = lock(_shmem_cb_lock) do
- @cfunction($shmem_cint, Cint, (Cint,))
- end
+ cb = @cfunction($shmem_cint, Cint, (Cint,))
cuOccupancyMaxPotentialBlockSize(blocks_ref, threads_ref, fun, cb, 0, max_threads)
else
lock(_shmem_cb_lock) do |
The call to |
Ah wait I can reproduce this, even with added locks: function doit()
a = rand(Int)
function f()
a += 1
a
end
cf = @cfunction $f Int ()
GC.@preserve cf begin
fptr = Base.unsafe_convert(Ptr{Cvoid}, cf)
b = ccall(fptr, Int, ())
@assert a == b
c = ccall(fptr, Int, ())
@assert a == c
@assert b+1 == c
end
end
@sync Threads.@threads for i = 1:2000000
doit()
end
|
More extensive version of the above locks implemented in JuliaLang/julia#39621. @norci, can you test this out? You can use the built binary from that PR: https://s3.amazonaws.com/julialangnightlies/assert_pretesting/linux/x64/1.7/julia-01cb47fa8e-linux64.tar.gz |
Yes. This patch fixed this issue. |
Using the master branch.
I always get this error in my project.
But I'm not able to reproduce it with a minimal example.
The operation is something like:
Update:
It fails occasionally when using a few `@async` tasks.
Log:
The text was updated successfully, but these errors were encountered: