Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix thread safety for pybind11 loader_life_support #3237

Merged
merged 28 commits into from
Sep 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
53b3919
Fix thread safety for pybind11 loader_life_support
laramiel Sep 2, 2021
caa974f
Also update the internals version as the internal struct is no longer…
laramiel Sep 2, 2021
9179d60
Add test demonstrating threading works correctly.
laramiel Sep 3, 2021
0c2bf55
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 3, 2021
4b7dc7a
Update test to use lifetime-extended references rather than
laramiel Sep 3, 2021
fdfff88
Merge branch 'master' of https://github.com/laramiel/pybind11
laramiel Sep 3, 2021
d91a4a7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 3, 2021
c6720ca
Make loader_life_support members private
laramiel Sep 3, 2021
2c07c0d
Merge branch 'master' of https://github.com/laramiel/pybind11
laramiel Sep 3, 2021
8a1a59f
Update version to dev2
laramiel Sep 3, 2021
dfc94f3
Update test to use python threading rather than concurrent.futures
laramiel Sep 3, 2021
4f25e31
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 3, 2021
366f40d
Remove unnecessary env in test
laramiel Sep 3, 2021
b5a0538
Remove unnecessary pytest in test
laramiel Sep 3, 2021
ffc52a3
Use native C++ thread_local in place of python per-thread data struct…
laramiel Sep 3, 2021
dc7df66
clang-format test_thread.cpp
laramiel Sep 3, 2021
dd8f264
Add a note about debugging the py::cast() error
laramiel Sep 3, 2021
c4c6acb
thread_test.py now propagates exceptions on join() calls.
laramiel Sep 7, 2021
6ad3de6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 7, 2021
d7e3067
remove unused sys / merge
laramiel Sep 7, 2021
638d091
Update include order in test_thread.cpp
laramiel Sep 9, 2021
a06f851
Remove spurious whitespace
laramiel Sep 9, 2021
5c58953
Update comment / whitespace.
laramiel Sep 9, 2021
5f66855
Address review comments
laramiel Sep 9, 2021
1237bbe
lint cleanup
laramiel Sep 9, 2021
afbc066
Fix test IntStruct constructor.
laramiel Sep 9, 2021
fe49b37
Merge branch 'master' of https://github.com/pybind/pybind11
Skylion007 Sep 10, 2021
5787104
Add explicit to constructor
Skylion007 Sep 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/pybind11/detail/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@

#define PYBIND11_VERSION_MAJOR 2
#define PYBIND11_VERSION_MINOR 8
#define PYBIND11_VERSION_PATCH 0.dev1
#define PYBIND11_VERSION_PATCH 0.dev2

// Similar to Python's convention: https://docs.python.org/3/c-api/apiabiversion.html
// Additional convention: 0xD = dev
#define PYBIND11_VERSION_HEX 0x020800D1
#define PYBIND11_VERSION_HEX 0x020800D2

#define PYBIND11_NAMESPACE_BEGIN(name) namespace name {
#define PYBIND11_NAMESPACE_END(name) }
Expand Down
6 changes: 3 additions & 3 deletions include/pybind11/detail/internals.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ struct internals {
std::unordered_map<const PyObject *, std::vector<PyObject *>> patients;
std::forward_list<ExceptionTranslator> registered_exception_translators;
std::unordered_map<std::string, void *> shared_data; // Custom data to be shared across extensions
std::vector<PyObject *> loader_patient_stack; // Used by `loader_life_support`
std::vector<PyObject *> unused_loader_patient_stack_remove_at_v5;
std::forward_list<std::string> static_strings; // Stores the std::strings backing detail::c_str()
PyTypeObject *static_property_type;
PyTypeObject *default_metaclass;
Expand Down Expand Up @@ -298,12 +298,12 @@ PYBIND11_NOINLINE internals &get_internals() {
#if PY_VERSION_HEX >= 0x03070000
internals_ptr->tstate = PyThread_tss_alloc();
if (!internals_ptr->tstate || (PyThread_tss_create(internals_ptr->tstate) != 0))
pybind11_fail("get_internals: could not successfully initialize the TSS key!");
pybind11_fail("get_internals: could not successfully initialize the tstate TSS key!");
PyThread_tss_set(internals_ptr->tstate, tstate);
#else
internals_ptr->tstate = PyThread_create_key();
if (internals_ptr->tstate == -1)
pybind11_fail("get_internals: could not successfully initialize the TLS key!");
pybind11_fail("get_internals: could not successfully initialize the tstate TLS key!");
PyThread_set_key_value(internals_ptr->tstate, tstate);
#endif
internals_ptr->istate = tstate->interp;
Expand Down
55 changes: 31 additions & 24 deletions include/pybind11/detail/type_caster_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,47 +31,54 @@ PYBIND11_NAMESPACE_BEGIN(detail)
/// A life support system for temporary objects created by `type_caster::load()`.
/// Adding a patient will keep it alive up until the enclosing function returns.
class loader_life_support {
private:
loader_life_support* parent = nullptr;
std::unordered_set<PyObject *> keep_alive;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably better to use pybind11::object here, as then the reference counting doesn't have to be done manually.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: Using a PySet was discussed previously, but it cannot be used here: if the type defines a custom hash and equality function, the set won't work correctly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately py::object is not hashable, so that would require intrusive adaptors or similar. Since the control flow here is so limited, I think that the simplest answer is to just use PyObject* here and refcount it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, good point.


static loader_life_support** get_stack_pp() {
#if defined(WITH_THREAD)
thread_local static loader_life_support* per_thread_stack = nullptr;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It occurred to me that an unfortunate effect of using C++ thread_local is that every single pybind11 extension module will add an additional TLS variable --- that could make the thread-local storage rather large, as there can easily be a large number of extension modules.

To avoid that, the Python TLS API could be used instead --- the key that is allocated would have to be stored in the internals struct (which would make the ABI incompatible), or if we really want to maintain ABI compatibility, could be accessed via a separate PyCapsule that is handled in the same way as the existing internals struct. Probably it would make more sense to just break the ABI, and take the opportunity to merge in the other ABI-breaking changes, though I don't have sufficient context to really offer much judgement on that decision.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And why does that concern you? I suspect that many extensions may already use thread_local variables without this concern.

The problem with using the python TLS API is that without versioning the internal data structure we're left with the same issue.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some extensions may already use TLS, but I expect that is only a tiny fraction of them, and presumably it is providing useful functionality.

This will add an extra separate TLS variable for every pybind11 extension module (due to the use of -fvisibility=hidden, they will not be merged). As you can easily have hundreds of extension modules loaded in a program, this is effectively adding potentially a very large number of TLS variables, and the memory usage scales with number_of_threads_in_program * number_of_pybind11_extensions. Furthermore, with #3257 taking care of the string_view use case, the main remaining use case is pybind11::implicitly_convertible, which I think is a relatively niche feature.

Yes, the Python TLS API only helps if we ensure there is a single key shared by all pybind11 extensions. As far as versioning the internals data structure, note that pybind11 already has such a mechanism, it is just a matter of bumping the abi version number --- the cost is that the type registry is not shared between different ABI versions, which may create an incompatibility between extensions compiled. The internals data is currently accessed via a PyCapsule set as an attribute of the builtins module in the Python interpreter. To avoid breaking the ABI, it would be possible to create a separate PyCapsule using the same mechanism that holds any additional data, in this case just the single tls key.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're talking about potentially hundreds of instances, which is admittedly an outlier.

As it stands now, I'd rather that we get this in to fix the threading issue, which I see as significant, and then we can rework it with an API version revision to use a shared python TLS variable or a pycapsule.

I'd prefer broader consensus w.r.t. the pycapsule encapsulation, though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think hundreds is an outlier. For example, tensorflow alone seems to have ~76.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another issue that occurred to me is that if you call py::cast from translation unit a.cc, and that call is nested within an invocation of a pybind11-bound function defined in translation unit b.cc, and these two translation units have separate copies of pybind11 due to -fvisibility=hidden, then previously this would have worked (albeit not thread safe) but now it will fail.

For example, within Google's code base, this could occur if we define some utility function in a pybind_cc_library target that calls py::cast, then use that utility function from within a pybind11-bound function defined in a pybind_extension. All of the pybind_cc_library targets will share a single copy of the thread_local, while each pybind_extension will have its own copy of the thread_local.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If in the implementation of such a call you call pybind11::cast in such a way that a loader_life_support temporary is necessary then you would need to add a scoped instance of loader_life_support in the -fvisibility=hidden method implementation. I think that this is a really rare edge case, and as I said, I think that the state of the world here is vastly better than before, and we can change the implementation with an internal API bump.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed that there is already a shared_data member of the internals struct that could be used to store the TLS key without breaking ABI --- a macro could control whether to use that or a regular data member of the internals struct. For efficiency the TLS key could also be copied to the local_internals struct. Then when ABI is bumped the mechanism using shared_data could be removed.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just my two cents - I would say that this is a critical bug, of which the impact is far worse than any of the aforementioned side effects/edge cases. The team I work with have taken a fork of pybind11 and applied this fix so that we can move forward - as we have a strong requirement to run IO-bound and computationally expensive functions in parallel. Without this fix, we cannot reliably do so. I would second @laramiel's suggestion to get this fix in, and then rework/optimise later if required.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have any objection to merging this as is, especially given that there appears to be a relatively easy path to revising it to avoid duplicating the TLS variables without introducing an ABI break.

return &per_thread_stack;
#else
static loader_life_support* global_stack = nullptr;
return &global_stack;
#endif
}

public:
/// A new patient frame is created when a function is entered
loader_life_support() {
get_internals().loader_patient_stack.push_back(nullptr);
loader_life_support** stack = get_stack_pp();
parent = *stack;
*stack = this;
}

/// ... and destroyed after it returns
~loader_life_support() {
auto &stack = get_internals().loader_patient_stack;
if (stack.empty())
loader_life_support** stack = get_stack_pp();
if (*stack != this)
pybind11_fail("loader_life_support: internal error");

auto ptr = stack.back();
stack.pop_back();
Py_CLEAR(ptr);

// A heuristic to reduce the stack's capacity (e.g. after long recursive calls)
if (stack.capacity() > 16 && !stack.empty() && stack.capacity() / stack.size() > 2)
stack.shrink_to_fit();
*stack = parent;
for (auto* item : keep_alive)
Py_DECREF(item);
}

/// This can only be used inside a pybind11-bound function, either by `argument_loader`
/// at argument preparation time or by `py::cast()` at execution time.
PYBIND11_NOINLINE static void add_patient(handle h) {
auto &stack = get_internals().loader_patient_stack;
if (stack.empty())
loader_life_support* frame = *get_stack_pp();
if (!frame) {
// NOTE: It would be nice to include the stack frames here, as this indicates
// use of pybind11::cast<> outside the normal call framework, and finding such
// a location is challenging. Developers could consider printing out
// stack frame addresses here using something like __builtin_frame_address(0).
throw cast_error("When called outside a bound function, py::cast() cannot "
"do Python -> C++ conversions which require the creation "
"of temporary values");

auto &list_ptr = stack.back();
if (list_ptr == nullptr) {
list_ptr = PyList_New(1);
if (!list_ptr)
pybind11_fail("loader_life_support: error allocating list");
PyList_SET_ITEM(list_ptr, 0, h.inc_ref().ptr());
} else {
auto result = PyList_Append(list_ptr, h.ptr());
if (result == -1)
pybind11_fail("loader_life_support: error adding patient");
}

if (frame->keep_alive.insert(h.ptr()).second)
Py_INCREF(h.ptr());
}
};

Expand Down
2 changes: 1 addition & 1 deletion pybind11/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ def _to_int(s):
return s


__version__ = "2.8.0.dev1"
__version__ = "2.8.0.dev2"
version_info = tuple(_to_int(s) for s in __version__.split("."))
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ set(PYBIND11_TEST_FILES
test_stl.cpp
test_stl_binders.cpp
test_tagbased_polymorphic.cpp
test_thread.cpp
test_union.cpp
test_virtual_functions.cpp)

Expand Down
66 changes: 66 additions & 0 deletions tests/test_thread.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
tests/test_thread.cpp -- call pybind11 bound methods in threads
Copyright (c) 2021 Laramie Leavitt (Google LLC) <lar@google.com>
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/

#include <pybind11/cast.h>
#include <pybind11/pybind11.h>

#include <chrono>
#include <thread>

#include "pybind11_tests.h"

namespace py = pybind11;

namespace {

struct IntStruct {
explicit IntStruct(int v) : value(v) {};
~IntStruct() { value = -value; }
IntStruct(const IntStruct&) = default;
IntStruct& operator=(const IntStruct&) = default;

int value;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could add a destructor that modifies value or modifies some global variable, so that we aren't relying on debugging features of the memory allocator as much.

Separately, perhaps add a note that this test should be run with asan for greater effectiveness.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

};

} // namespace

TEST_SUBMODULE(thread, m) {
    // Expose IntStruct to Python; instances are constructed from a single int.
    py::class_<IntStruct>(m, "IntStruct").def(py::init([](const int i) { return IntStruct(i); }));

    // When an implicit conversion is required, implicitly_convertible relies
    // on loader_life_support to lifetime-extend the converted reference.
    //
    // Run this test under ASAN for better effectiveness.
    py::implicitly_convertible<int, IntStruct>();

    // "test": the GIL is released only for the duration of the sleep, then the
    // (possibly implicitly converted) argument is checked against `want`.
    m.def("test", [](int want, const IntStruct &arg) {
        {
            py::gil_scoped_release no_gil;
            std::this_thread::sleep_for(std::chrono::milliseconds(5));
        }

        if (arg.value == want) {
            return;
        }
        throw std::runtime_error("Value changed!!");
    });

    // "test_no_gil": same check, but registered with a gil_scoped_release
    // call guard instead of releasing the GIL inside the body.
    m.def(
        "test_no_gil",
        [](int want, const IntStruct &arg) {
            std::this_thread::sleep_for(std::chrono::milliseconds(5));
            if (arg.value == want) {
                return;
            }
            throw std::runtime_error("Value changed!!");
        },
        py::call_guard<py::gil_scoped_release>());

    // NOTE: std::string_view also uses loader_life_support to ensure that
    // the string contents remain alive, but that's a C++17 feature.
}
44 changes: 44 additions & 0 deletions tests/test_thread.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-

import threading

from pybind11_tests import thread as m


class Thread(threading.Thread):
def __init__(self, fn):
super(Thread, self).__init__()
self.fn = fn
self.e = None

def run(self):
try:
for i in range(10):
self.fn(i, i)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can I avoid implicit cast (and triggering life support) by changing this line to self.fn(i, m.IntStruct(i))?

except Exception as e:
self.e = e

def join(self):
super(Thread, self).join()
if self.e:
raise self.e


def test_implicit_conversion():
    """Run m.test concurrently on three threads.

    Threads are started in creation order and joined in reverse order;
    Thread.join() re-raises any exception captured by the worker.
    """
    workers = [Thread(m.test) for _ in range(3)]
    for worker in workers:
        worker.start()
    for worker in reversed(workers):
        worker.join()


def test_implicit_conversion_no_gil():
    """Run m.test_no_gil concurrently on three threads.

    Threads are started in creation order and joined in reverse order;
    Thread.join() re-raises any exception captured by the worker.
    """
    workers = [Thread(m.test_no_gil) for _ in range(3)]
    for worker in workers:
        worker.start()
    for worker in reversed(workers):
        worker.join()