Skip to content

Commit

Permalink
Eliminate duplicate TLS keys for loader_life_support stack
Browse files Browse the repository at this point in the history
This revises the existing fix for
#2765 in
#3237 to reduce the amount of
TLS storage used.

The shared TLS key is stored in two different ways, depending on
`PYBIND11_INTERNALS_VERSION`.  If `PYBIND11_INTERNALS_VERSION ==
4` (as is currently set), the TLS key is stored in the
`internal::shared_data` map to avoid breaking ABI compatibility.  If
`PYBIND11_INTERNALS_VERSION > 4`, the TLS key is stored directly in
the `internals` struct.
  • Loading branch information
jbms committed Sep 16, 2021
1 parent e0031bf commit c19b27b
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 60 deletions.
155 changes: 110 additions & 45 deletions include/pybind11/detail/internals.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,19 @@

#include "../pytypes.h"

/// Tracks the `internals` and `type_info` ABI version independent of the main library version.
///
/// Some portions of the code use an ABI that is conditional depending on this
/// version number. That allows ABI-breaking changes to be "pre-implemented".
/// Once the default version number is incremented, the conditional logic that
/// no longer applies can be removed. Additionally, users that need not
/// maintain ABI compatibility can increase the version number in order to take
/// advantage of any functionality/efficiency improvements that depend on the
/// newer ABI.
#ifndef PYBIND11_INTERNALS_VERSION
#define PYBIND11_INTERNALS_VERSION 4
#endif

PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)

using ExceptionTranslator = void (*)(std::exception_ptr);
Expand All @@ -25,30 +38,44 @@ inline PyObject *make_object_base_type(PyTypeObject *metaclass);
// The old Python Thread Local Storage (TLS) API is deprecated in Python 3.7 in favor of the new
// Thread Specific Storage (TSS) API.
#if PY_VERSION_HEX >= 0x03070000
# define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr
# define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key))
# define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value))
# define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr)
# define PYBIND11_TLS_FREE(key) PyThread_tss_free(key)
// Avoid unnecessary allocation of `Py_tss_t`, since we cannot use
// `Py_LIMITED_API` anyway.
# if PYBIND11_INTERNALS_VERSION > 4
# define PYBIND11_TLS_KEY_REF Py_tss_t &
# define PYBIND11_TLS_KEY_INIT(var) Py_tss_t var = Py_tss_NEEDS_INIT
# define PYBIND11_TLS_KEY_CREATE(var) (PyThread_tss_create(&(var)) == 0)
# define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get(&(key))
# define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set(&(key), (value))
# define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set(&(key), nullptr)
# define PYBIND11_TLS_FREE(key) PyThread_tss_delete(&(key))
# else
# define PYBIND11_TLS_KEY_REF Py_tss_t *
# define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr
# define PYBIND11_TLS_KEY_CREATE(var) \
(((var) = PyThread_tss_alloc()) != nullptr && (PyThread_tss_create((var)) == 0))
# define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key))
# define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value))
# define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr)
# define PYBIND11_TLS_FREE(key) PyThread_tss_free(key)
# endif
#else
// Usually an int but a long on Cygwin64 with Python 3.x
# define PYBIND11_TLS_KEY_INIT(var) decltype(PyThread_create_key()) var = 0
// Usually an int but a long on Cygwin64 with Python 3.x
# define PYBIND11_TLS_KEY_REF decltype(PyThread_create_key())
# define PYBIND11_TLS_KEY_INIT(var) PYBIND11_TLS_KEY_REF var = 0
# define PYBIND11_TLS_KEY_CREATE(var) (((var) = PyThread_create_key()) != -1)
# define PYBIND11_TLS_GET_VALUE(key) PyThread_get_key_value((key))
# if PY_MAJOR_VERSION < 3
# define PYBIND11_TLS_DELETE_VALUE(key) \
PyThread_delete_key_value(key)
# define PYBIND11_TLS_REPLACE_VALUE(key, value) \
do { \
PyThread_delete_key_value((key)); \
PyThread_set_key_value((key), (value)); \
} while (false)
# define PYBIND11_TLS_DELETE_VALUE(key) PyThread_delete_key_value(key)
# define PYBIND11_TLS_REPLACE_VALUE(key, value) \
do { \
PyThread_delete_key_value((key)); \
PyThread_set_key_value((key), (value)); \
} while (false)
# else
# define PYBIND11_TLS_DELETE_VALUE(key) \
PyThread_set_key_value((key), nullptr)
# define PYBIND11_TLS_REPLACE_VALUE(key, value) \
PyThread_set_key_value((key), (value))
# define PYBIND11_TLS_DELETE_VALUE(key) PyThread_set_key_value((key), nullptr)
# define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_set_key_value((key), (value))
# endif
# define PYBIND11_TLS_FREE(key) (void)key
# define PYBIND11_TLS_FREE(key) (void) key
#endif

// Python loads modules by default with dlopen with the RTLD_LOCAL flag; under libc++ and possibly
Expand Down Expand Up @@ -106,22 +133,31 @@ struct internals {
std::unordered_map<const PyObject *, std::vector<PyObject *>> patients;
std::forward_list<ExceptionTranslator> registered_exception_translators;
std::unordered_map<std::string, void *> shared_data; // Custom data to be shared across extensions
#if PYBIND11_INTERNALS_VERSION == 4
std::vector<PyObject *> unused_loader_patient_stack_remove_at_v5;
#endif
std::forward_list<std::string> static_strings; // Stores the std::strings backing detail::c_str()
PyTypeObject *static_property_type;
PyTypeObject *default_metaclass;
PyObject *instance_base;
#if defined(WITH_THREAD)
PYBIND11_TLS_KEY_INIT(tstate);
# if PYBIND11_INTERNALS_VERSION > 4
PYBIND11_TLS_KEY_INIT(loader_life_support_tls_key);
# endif // PYBIND11_INTERNALS_VERSION > 4
PyInterpreterState *istate = nullptr;
~internals() {
# if PYBIND11_INTERNALS_VERSION > 4
PYBIND11_TLS_FREE(loader_life_support_tls_key);
# endif // PYBIND11_INTERNALS_VERSION > 4

// This destructor is called *after* Py_Finalize() in finalize_interpreter().
// That *SHOULD BE* fine. The following details what happens when PyThread_tss_free is called.
// PYBIND11_TLS_FREE is PyThread_tss_free on python 3.7+. On older python, it does nothing.
// PyThread_tss_free calls PyThread_tss_delete and PyMem_RawFree.
// PyThread_tss_delete just calls TlsFree (on Windows) or pthread_key_delete (on *NIX). Neither
// of those have anything to do with CPython internals.
// PyMem_RawFree *requires* that the `tstate` be allocated with the CPython allocator.
// That *SHOULD BE* fine. The following details what happens when PyThread_tss_free is
// called. PYBIND11_TLS_FREE is PyThread_tss_free on python 3.7+. On older python, it does
// nothing. PyThread_tss_free calls PyThread_tss_delete and PyMem_RawFree.
// PyThread_tss_delete just calls TlsFree (on Windows) or pthread_key_delete (on *NIX).
// Neither of those have anything to do with CPython internals. PyMem_RawFree *requires*
// that the `tstate` be allocated with the CPython allocator.
PYBIND11_TLS_FREE(tstate);
}
#endif
Expand Down Expand Up @@ -153,9 +189,6 @@ struct type_info {
bool module_local : 1;
};

/// Tracks the `internals` and `type_info` ABI version independent of the main library version
#define PYBIND11_INTERNALS_VERSION 4

/// On MSVC, debug and release builds are not ABI-compatible!
#if defined(_MSC_VER) && defined(_DEBUG)
# define PYBIND11_BUILD_TYPE "_debug"
Expand Down Expand Up @@ -291,21 +324,21 @@ PYBIND11_NOINLINE internals &get_internals() {
internals_ptr = new internals();
#if defined(WITH_THREAD)

#if PY_VERSION_HEX < 0x03090000
PyEval_InitThreads();
#endif
# if PY_VERSION_HEX < 0x03090000
PyEval_InitThreads();
# endif
PyThreadState *tstate = PyThreadState_Get();
#if PY_VERSION_HEX >= 0x03070000
internals_ptr->tstate = PyThread_tss_alloc();
if (!internals_ptr->tstate || (PyThread_tss_create(internals_ptr->tstate) != 0))
pybind11_fail("get_internals: could not successfully initialize the tstate TSS key!");
PyThread_tss_set(internals_ptr->tstate, tstate);
#else
internals_ptr->tstate = PyThread_create_key();
if (internals_ptr->tstate == -1)
pybind11_fail("get_internals: could not successfully initialize the tstate TLS key!");
PyThread_set_key_value(internals_ptr->tstate, tstate);
#endif
if (!PYBIND11_TLS_KEY_CREATE(internals_ptr->tstate)) {
pybind11_fail("get_internals: could not successfully initialize the tstate TSS key!");
}
PYBIND11_TLS_REPLACE_VALUE(internals_ptr->tstate, tstate);

# if PYBIND11_INTERNALS_VERSION > 4
if (!PYBIND11_TLS_KEY_CREATE(internals_ptr->loader_life_support_tls_key)) {
pybind11_fail("get_internals: could not successfully initialize the "
"loader_life_support TSS key!");
}
# endif
internals_ptr->istate = tstate->interp;
#endif
builtins[id] = capsule(internals_pp);
Expand All @@ -317,16 +350,48 @@ PYBIND11_NOINLINE internals &get_internals() {
return **internals_pp;
}


// the internals struct (above) is shared between all the modules. local_internals are only
// for a single module. Any changes made to internals may require an update to
// PYBIND11_INTERNALS_VERSION, breaking backwards compatibility. local_internals is, by design,
// restricted to a single module. Whether a module has local internals or not should not
// impact any other modules, because the only things accessing the local internals is the
// module that contains them.
struct local_internals {
type_map<type_info *> registered_types_cpp;
std::forward_list<ExceptionTranslator> registered_exception_translators;
type_map<type_info *> registered_types_cpp;
std::forward_list<ExceptionTranslator> registered_exception_translators;
#if defined(WITH_THREAD) && PYBIND11_INTERNALS_VERSION == 4

// For ABI compatibility, we can't store the loader_life_support TLS key in
// the `internals` struct directly. Instead, we store it in `shared_data` and
// cache a copy in `local_internals`. If we allocated a separate TLS key for
// each instance of `local_internals`, we could end up allocating hundreds of
// TLS keys if hundreds of different pybind11 modules are loaded (which is a
// plausible number).
PYBIND11_TLS_KEY_INIT(loader_life_support_tls_key);

// Holds the shared TLS key for the loader_life_support stack.
struct shared_loader_life_support_data {
PYBIND11_TLS_KEY_INIT(loader_life_support_tls_key);
shared_loader_life_support_data() {
if (!PYBIND11_TLS_KEY_CREATE(loader_life_support_tls_key)) {
pybind11_fail("local_internals: could not successfully initialize the "
"loader_life_support TLS key!");
}
}
// We can't help but leak the TLS key, because Python never unloads extension modules.
};

local_internals() {
auto &internals = get_internals();
// Get or create the `loader_life_support_stack_key`.
auto &ptr = internals.shared_data["_life_support"];
if (!ptr) {
ptr = new shared_loader_life_support_data;
}
loader_life_support_tls_key
= static_cast<shared_loader_life_support_data *>(ptr)->loader_life_support_tls_key;
}
#endif // defined(WITH_THREAD) && PYBIND11_INTERNALS_VERSION == 4
};

/// Works like `get_internals`, but for things which are locally registered.
Expand Down
39 changes: 26 additions & 13 deletions include/pybind11/detail/type_caster_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,38 +35,51 @@ class loader_life_support {
loader_life_support* parent = nullptr;
std::unordered_set<PyObject *> keep_alive;

static loader_life_support** get_stack_pp() {
#if defined(WITH_THREAD)
thread_local static loader_life_support* per_thread_stack = nullptr;
return &per_thread_stack;
// Store stack pointer in thread-local storage.
static PYBIND11_TLS_KEY_REF get_stack_tls_key() {
# if PYBIND11_INTERNALS_VERSION == 4
return get_local_internals().loader_life_support_tls_key;
# else
return get_internals().loader_life_support_tls_key;
# endif
}
static loader_life_support *get_stack_top() {
return static_cast<loader_life_support *>(PYBIND11_TLS_GET_VALUE(get_stack_tls_key()));
}
static void set_stack_top(loader_life_support *value) {
PYBIND11_TLS_REPLACE_VALUE(get_stack_tls_key(), value);
}
#else
static loader_life_support* global_stack = nullptr;
return &global_stack;
#endif
// Use single global variable for stack.
static loader_life_support **get_stack_pp() {
static loader_life_support *global_stack = nullptr;
return global_stack;
}
static loader_life_support *get_stack_top() { return *get_stack_pp(); }
static void set_stack_top(loader_life_support *value) { *get_stack_pp() = value; }
#endif

public:
/// A new patient frame is created when a function is entered
loader_life_support() {
loader_life_support** stack = get_stack_pp();
parent = *stack;
*stack = this;
parent = get_stack_top();
set_stack_top(this);
}

/// ... and destroyed after it returns
~loader_life_support() {
loader_life_support** stack = get_stack_pp();
if (*stack != this)
if (get_stack_top() != this)
pybind11_fail("loader_life_support: internal error");
*stack = parent;
set_stack_top(parent);
for (auto* item : keep_alive)
Py_DECREF(item);
}

/// This can only be used inside a pybind11-bound function, either by `argument_loader`
/// at argument preparation time or by `py::cast()` at execution time.
PYBIND11_NOINLINE static void add_patient(handle h) {
loader_life_support* frame = *get_stack_pp();
loader_life_support *frame = get_stack_top();
if (!frame) {
// NOTE: It would be nice to include the stack frames here, as this indicates
// use of pybind11::cast<> outside the normal call framework, finding such
Expand Down
4 changes: 2 additions & 2 deletions include/pybind11/gil.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ PYBIND11_NAMESPACE_END(detail)
class gil_scoped_acquire {
public:
PYBIND11_NOINLINE gil_scoped_acquire() {
auto const &internals = detail::get_internals();
auto &internals = detail::get_internals();
tstate = (PyThreadState *) PYBIND11_TLS_GET_VALUE(internals.tstate);

if (!tstate) {
Expand Down Expand Up @@ -132,7 +132,7 @@ class gil_scoped_release {
// `get_internals()` must be called here unconditionally in order to initialize
// `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an
// initialization race could occur as multiple threads try `gil_scoped_acquire`.
const auto &internals = detail::get_internals();
auto &internals = detail::get_internals();
tstate = PyEval_SaveThread();
if (disassoc) {
auto key = internals.tstate;
Expand Down

0 comments on commit c19b27b

Please sign in to comment.