Skip to content

Commit

Permalink
Fix race condition in free-threaded Python (fixes issue #867)
Browse files Browse the repository at this point in the history
This commit addresses an issue arising when multiple threads want to
access the Python object associated with the same C++ instance, which
does not exist yet and therefore must be created. @vfdev-5 reported that
TSAN detects a race condition in code that uses this pattern, caused by
concurrent unprotected reads/writes of internal ``nb_inst`` fields.

There is also a larger problem: depending on how operations are
sequenced, it is possible that two threads simultaneously create a
Python wrapper, which violates the usual invariant that each (C++
instance pointer, type) pair maps to at most one Python object.

This PR updates nanobind to preserve this invariant. When registering a
newly created wrapper object in the internal data structures, nanobind
checks if another equivalent wrapper has been created in the meantime.
If so, we destroy the thread's instance and return the registered one.

This requires some extra handling code that, however, only runs with
very low probability. It also adds a new ``registered`` bit flag to
``nb_inst``, which makes it possible to have ``nb_inst`` objects that
aren't registered in the internal data structures. I am planning to use
that feature to fix the (unrelated) issue #879.
  • Loading branch information
wjakob committed Jan 28, 2025
1 parent 534fd8c commit 1415e05
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 16 deletions.
17 changes: 14 additions & 3 deletions src/nb_internals.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,11 @@ struct nb_inst { // usually: 24 bytes
/// Does this instance use intrusive reference counting?
uint32_t intrusive : 1;

/// Is this instance registered in the nanobind instance map?
uint32_t registered : 1;

// That's a lot of unused space. I wonder if there is a good use for it..
uint32_t unused : 24;
uint32_t unused : 23;
};

static_assert(sizeof(nb_inst) == sizeof(PyObject) + sizeof(uint32_t) * 2);
Expand Down Expand Up @@ -489,8 +492,16 @@ template <typename T> struct scoped_pymalloc {
#if defined(NB_FREE_THREADED)
/// RAII guard holding the mutex of one ``nb_shard`` (free-threaded builds
/// only). The lock can be released before the end of the scope via
/// ``unlock()``; the destructor is then a no-op. This early-release path is
/// needed when the guarded code must drop the lock before performing an
/// operation (e.g. ``Py_DECREF``) that may re-enter the shard.
struct lock_shard {
    nb_shard &s;   // shard whose mutex this guard manages
    bool active;   // true while the mutex is currently held by this guard

    lock_shard(nb_shard &s) : s(s), active(true) { PyMutex_Lock(&s.mutex); }
    ~lock_shard() { unlock(); }

    // Non-copyable: a copied guard would unlock the same mutex twice.
    lock_shard(const lock_shard &) = delete;
    lock_shard &operator=(const lock_shard &) = delete;

    /// Release the mutex if it is still held; safe to call repeatedly.
    void unlock() {
        if (active) {
            PyMutex_Unlock(&s.mutex);
            active = false;
        }
    }
};
struct lock_internals {
nb_internals *i;
Expand Down
57 changes: 48 additions & 9 deletions src/nb_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,10 @@ PyObject *inst_new_int(PyTypeObject *tp, PyObject * /* args */,
self->cpp_delete = 0;
self->clear_keep_alive = 0;
self->intrusive = intrusive;
self->registered = 1;
self->unused = 0;

// Make the object compatible with nb_try_inc_ref (free-threaded builds only)
nb_enable_try_inc_ref((PyObject *) self);

// Update hash table that maps from C++ to Python instance
Expand All @@ -111,7 +114,9 @@ PyObject *inst_new_int(PyTypeObject *tp, PyObject * /* args */,
return (PyObject *) self;
}

/// Allocate memory for a nb_type instance with external storage
/// Allocate memory for a nb_type instance with external storage. In contrast to
/// 'inst_new_int()', this does not yet register the instance in the internal
/// data structures. The function 'inst_register()' must be used to do so.
PyObject *inst_new_ext(PyTypeObject *tp, void *value) {
bool gc = PyType_HasFeature(tp, Py_TPFLAGS_HAVE_GC);

Expand Down Expand Up @@ -164,14 +169,27 @@ PyObject *inst_new_ext(PyTypeObject *tp, void *value) {
self->cpp_delete = 0;
self->clear_keep_alive = 0;
self->intrusive = intrusive;

// We already set this flag to 1 here so that we don't have to change it again
// afterwards. This requires that the call to 'inst_new_ext' is paired with
// a call to 'inst_register()'

self->registered = 1;
self->unused = 0;

// Make the object compatible with nb_try_inc_ref (free-threaded builds only)
nb_enable_try_inc_ref((PyObject *) self);

return (PyObject *) self;
}

/// Register the object constructed by 'inst_new_ext()' in the internal data structures
static nb_inst *inst_register(nb_inst *inst, void *value) noexcept {
nb_shard &shard = internals->shard(value);
lock_shard guard(shard);

// Update hash table that maps from C++ to Python instance
auto [it, success] = shard.inst_c2p.try_emplace(value, self);
auto [it, success] = shard.inst_c2p.try_emplace(value, inst);

if (NB_UNLIKELY(!success)) {
void *entry = it->second;
Expand All @@ -186,27 +204,45 @@ PyObject *inst_new_ext(PyTypeObject *tp, void *value) {
entry = it.value() = nb_mark_seq(first);
}

PyTypeObject *tp = Py_TYPE(inst);
nb_inst_seq *seq = nb_get_seq(entry);
while (true) {
check((nb_inst *) seq->inst != self,
"nanobind::detail::inst_new_ext(): duplicate instance!");
nb_inst *inst_2 = (nb_inst *) seq->inst;
PyTypeObject *tp_2 = Py_TYPE(inst_2);

// The following should never happen
check(inst_2 != inst, "nanobind::detail::inst_new_ext(): duplicate instance!");

// In the case of concurrent execution, another thread might have created an
// identical instance wrapper in the meantime. Let's return that one then.
if (tp == tp_2 &&
!(nb_type_data(tp_2)->flags & (uint32_t) type_flags::is_python_type) &&
nb_try_inc_ref((PyObject *) inst_2)) {
inst->destruct = inst->cpp_delete = inst->registered = false;
guard.unlock();
Py_DECREF(inst);
return inst_2;
}

if (!seq->next)
break;

seq = seq->next;
}

nb_inst_seq *next = (nb_inst_seq *) PyMem_Malloc(sizeof(nb_inst_seq));
check(next,
"nanobind::detail::inst_new_ext(): list element allocation failed!");

next->inst = (PyObject *) self;
next->inst = (PyObject *) inst;
next->next = nullptr;
seq->next = next;
}

return (PyObject *) self;
return inst;
}


static void inst_dealloc(PyObject *self) {
PyTypeObject *tp = Py_TYPE(self);
const type_data *t = nb_type_data(tp);
Expand Down Expand Up @@ -253,7 +289,7 @@ static void inst_dealloc(PyObject *self) {

nb_weakref_seq *wr_seq = nullptr;

{
if (inst->registered) {
// Enter critical section of shard
nb_shard &shard = internals->shard(p);
lock_shard guard(shard);
Expand Down Expand Up @@ -1737,6 +1773,9 @@ static PyObject *nb_type_put_common(void *value, type_data *t, rv_policy rvp,
if (intrusive)
t->set_self_py(new_value, (PyObject *) inst);

if (!create_new)
inst = inst_register(inst, value);

return (PyObject *) inst;
}

Expand Down Expand Up @@ -2082,7 +2121,7 @@ PyObject *nb_inst_reference(PyTypeObject *t, void *ptr, PyObject *parent) {
nbi->state = nb_inst::state_ready;
if (parent)
keep_alive(result, parent);
return result;
return (PyObject *) inst_register(nbi, ptr);
}

PyObject *nb_inst_take_ownership(PyTypeObject *t, void *ptr) {
Expand All @@ -2092,7 +2131,7 @@ PyObject *nb_inst_take_ownership(PyTypeObject *t, void *ptr) {
nb_inst *nbi = (nb_inst *) result;
nbi->destruct = nbi->cpp_delete = true;
nbi->state = nb_inst::state_ready;
return result;
return (PyObject *) inst_register(nbi, ptr);
}

void *nb_inst_ptr(PyObject *o) noexcept {
Expand Down
8 changes: 4 additions & 4 deletions tests/test_thread.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test02_global_lock(n_threads=8):
n = 100000
c = Counter()
def f():
for i in range(n):
for _ in range(n):
t.inc_global(c)

parallelize(f, n_threads=n_threads)
Expand All @@ -53,7 +53,7 @@ def test04_locked_function(n_threads=8):
n = 100000
c = Counter()
def f():
for i in range(n):
for _ in range(n):
t.inc_safe(c)

parallelize(f, n_threads=n_threads)
Expand All @@ -77,11 +77,11 @@ def f():
assert c.value == n * n_threads


def test_06_global_wrapper(n_threads=8):
def test06_global_wrapper(n_threads=8):
# Check wrapper lookup racing with wrapper deallocation
n = 10000
def f():
for i in range(n):
for _ in range(n):
GlobalData.get()
GlobalData.get()
GlobalData.get()
Expand Down

0 comments on commit 1415e05

Please sign in to comment.