From 3cea7677c47d004c6ae33cba26a362c15de10825 Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Sun, 8 Sep 2024 10:05:21 +0900 Subject: [PATCH 01/17] free-threading: CMake build sytem This commit adds the ``FREE_THREADED`` parameter to the CMake ``nanobind_add_module()`` command. It does not do anything for now besides defining ``NB_FREE_THREADED`` in C++ compilation units. --- .gitignore | 8 ++++++-- CMakeLists.txt | 11 ++++++----- cmake/nanobind-config.cmake | 26 ++++++++++++++++++++++++-- docs/api_cmake.rst | 7 +++++++ tests/CMakeLists.txt | 4 ++++ 5 files changed, 47 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 359fdefb4..5ea58f3a6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,15 @@ libnanobind-static.a libnanobind-static-abi3.a -libnanobind.dylib +libnanobind-static-ft.a libnanobind.so -libnanobind-abi3.dylib libnanobind-abi3.so +libnanobind-ft.so +libnanobind.dylib +libnanobind-abi3.dylib +libnanobind-ft.dylib nanobind.dll nanobind-abi3.dll +nanobind-ft.dll libinter_module.dylib libinter_module.so diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ec067809..c47b471d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,13 +27,14 @@ endif() option(NB_CREATE_INSTALL_RULES "Create installation rules" ${NB_MASTER_PROJECT}) option(NB_USE_SUBMODULE_DEPS "Use the nanobind dependencies shipped as a git submodule of this repository" ON) -option(NB_TEST "Compile nanobind tests?" ${NB_MASTER_PROJECT}) -option(NB_TEST_STABLE_ABI "Test the stable ABI interface?" OFF) -option(NB_TEST_SHARED_BUILD "Build a shared nanobind library for the test suite?" OFF) -option(NB_TEST_CUDA "Force the use of the CUDA/NVCC compiler for testing purposes" OFF) +option(NB_TEST "Compile nanobind tests?" ${NB_MASTER_PROJECT}) +option(NB_TEST_STABLE_ABI "Test the stable ABI interface?" OFF) +option(NB_TEST_SHARED_BUILD "Build a shared nanobind library for the test suite?" OFF) +option(NB_TEST_CUDA "Force the use of the CUDA/NVCC compiler for testing purposes" OFF) +option(NB_TEST_FREE_THREADED "Build free-threaded extensions for the test suite?" ON) if (NOT MSVC) - option(NB_TEST_SANITZE "Build tests with address and undefined behavior sanitizers?" OFF) + option(NB_TEST_SANITIZE "Build tests with address and undefined behavior sanitizers?" OFF) endif() # --------------------------------------------------------------------------- diff --git a/cmake/nanobind-config.cmake b/cmake/nanobind-config.cmake index ec7ff583c..3ef847e4c 100644 --- a/cmake/nanobind-config.cmake +++ b/cmake/nanobind-config.cmake @@ -40,6 +40,9 @@ if(DEFINED NB_SOSABI) endif() endif() +# Extract Python version and extensions (e.g. free-threaded build) +string(REGEX REPLACE "[^-]*-([^-]*)-.*" "\\1" NB_ABI "${NB_SOABI}") + # If either suffix is missing, call Python to compute it if(NOT DEFINED NB_SUFFIX OR NOT DEFINED NB_SUFFIX_S) # Query Python directly to get the right suffix. @@ -72,6 +75,7 @@ endif() # Stash these for later use set(NB_SUFFIX ${NB_SUFFIX} CACHE INTERNAL "") set(NB_SUFFIX_S ${NB_SUFFIX_S} CACHE INTERNAL "") +set(NB_ABI ${NB_ABI} CACHE INTERNAL "") get_filename_component(NB_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) get_filename_component(NB_DIR "${NB_DIR}" PATH) @@ -201,13 +205,17 @@ function (nanobind_build_library TARGET_NAME) endif() if (WIN32) - if (${TARGET_NAME} MATCHES "abi3") + if (${TARGET_NAME} MATCHES "-abi3") target_link_libraries(${TARGET_NAME} PUBLIC Python::SABIModule) else() target_link_libraries(${TARGET_NAME} PUBLIC Python::Module) endif() endif() + if (TARGET_NAME MATCHES "-ft") + target_compile_definitions(${TARGET_NAME} PUBLIC NB_FREE_THREADED) + endif() + # Nanobind performs many assertion checks -- detailed error messages aren't # included in Release/MinSizeRel modes target_compile_definitions(${TARGET_NAME} PRIVATE @@ -297,7 +305,7 @@ endfunction() function(nanobind_add_module name) cmake_parse_arguments(PARSE_ARGV 1 ARG - "STABLE_ABI;NB_STATIC;NB_SHARED;PROTECT_STACK;LTO;NOMINSIZE;NOSTRIP;MUSL_DYNAMIC_LIBCPP" + "STABLE_ABI;FREE_THREADED;NB_STATIC;NB_SHARED;PROTECT_STACK;LTO;NOMINSIZE;NOSTRIP;MUSL_DYNAMIC_LIBCPP" "NB_DOMAIN" "") add_library(${name} MODULE ${ARG_UNPARSED_ARGUMENTS}) @@ -319,6 +327,12 @@ function(nanobind_add_module name) set(ARG_STABLE_ABI FALSE) endif() + if (NB_ABI MATCHES "t") + set(ARG_STABLE_ABI FALSE) + else(ARG_STABLE_ABI) + set(ARG_FREE_THREADED FALSE) + endif() + set(libname "nanobind") if (ARG_NB_STATIC) set(libname "${libname}-static") @@ -328,6 +342,10 @@ function(nanobind_add_module name) set(libname "${libname}-abi3") endif() + if (ARG_FREE_THREADED) + set(libname "${libname}-ft") + endif() + if (ARG_NB_DOMAIN AND ARG_NB_SHARED) set(libname ${libname}-${ARG_NB_DOMAIN}) endif() @@ -345,6 +363,10 @@ function(nanobind_add_module name) nanobind_extension(${name}) endif() + if (ARG_FREE_THREADED) + target_compile_definitions(${name} PRIVATE NB_FREE_THREADED) + endif() + target_link_libraries(${name} PRIVATE ${libname}) if (NOT ARG_PROTECT_STACK) diff --git a/docs/api_cmake.rst b/docs/api_cmake.rst index c87181761..17a15b67b 100644 --- a/docs/api_cmake.rst +++ b/docs/api_cmake.rst @@ -69,6 +69,11 @@ The high-level interface consists of just one CMake command: `__ build, making it possible to use a compiled extension across Python minor versions. The flag is ignored on Python versions older than < 3.12. + * - ``FREE_THREADED`` + - Compile an Python extension that opts into free-threaded (i.e., + GIL-less) Python behavior, which requires a special free-threaded + build of Python 3.13 or newer. The flag is ignored on unsupported + Python versions. * - ``NB_STATIC`` - Compile the core nanobind library as a static library. This simplifies redistribution but can increase the combined binary @@ -270,6 +275,8 @@ The various commands are described below: - Perform a static library build (without this suffix, a shared build is used) * - ``-abi3`` - Perform a stable ABI build targeting Python v3.12+. + * - ``-ft`` + - Perform a build that opts into the Python 3.13+ free-threaded behavior. .. code-block:: cmake diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d65947296..785db96d2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,6 +5,10 @@ if (NB_TEST_STABLE_ABI) set(NB_EXTRA_ARGS ${NB_EXTRA_ARGS} STABLE_ABI) endif() +if (NB_TEST_FREE_THREADED) + set(NB_EXTRA_ARGS ${NB_EXTRA_ARGS} FREE_THREADED) +endif() + if (NB_TEST_SHARED_BUILD) set(NB_EXTRA_ARGS ${NB_EXTRA_ARGS} NB_SHARED) endif() From e47d2c9cafb7d96c1da6a55727987da7647bbd26 Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Sun, 8 Sep 2024 10:06:15 +0900 Subject: [PATCH 02/17] free-threading: Add free-threaded CI job to the GHA build matrix --- .github/workflows/ci.yml | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4fa0c4455..c620ff24c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,4 +135,39 @@ jobs: run: cmake --build build -j 2 - name: Run tests - run: cd build && python3 -m pytest + run: > + cd build; + python -m pytest + + free-threaded: + name: "Python 3.13-dev / ubuntu.latest [free-threaded]" + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + submodules: true + + - uses: deadsnakes/action@v3.1.0 + with: + python-version: 3.13-dev + nogil: true + + - name: Install the latest CMake + uses: lukka/get-cmake@latest + + - name: Install PyTest + run: | + python -m pip install pytest pytest-github-actions-annotate-failures + + - name: Configure + run: > + cmake -S . -B build -DNB_TEST_FREE_THREADED=ON + + - name: Build C++ + run: cmake --build build -j 2 + + - name: Run tests + run: > + cd build; + python -m pytest From 84ce61ee2911fce21f91d077fce6bba5ed7995fb Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Sun, 8 Sep 2024 10:10:46 +0900 Subject: [PATCH 03/17] free-threading: ABI separation Nanobind's internal data structures switch to a different layout when free-threading was requested. We must give such extensions a different ABI tag since they are incompatible with non-free-threaded ones. --- src/nb_internals.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/nb_internals.cpp b/src/nb_internals.cpp index 6060a79f7..cb032a56d 100644 --- a/src/nb_internals.cpp +++ b/src/nb_internals.cpp @@ -72,6 +72,12 @@ # define NB_STABLE_ABI "" #endif +#if defined(NB_FREE_THREADED) +# define NB_FREE_THREADED_ABI "_ft" +#else +# define NB_FREE_THREADED_ABI "" +#endif + #if NB_VERSION_DEV > 0 #define NB_VERSION_DEV_STR "_dev" NB_TOSTRING(NB_VERSION_DEV) #else @@ -80,7 +86,8 @@ #define NB_INTERNALS_ID \ "v" NB_TOSTRING(NB_INTERNALS_VERSION) \ - NB_COMPILER_TYPE NB_STDLIB NB_VERSION_DEV_STR NB_BUILD_ABI NB_BUILD_TYPE NB_STABLE_ABI + NB_COMPILER_TYPE NB_STDLIB NB_VERSION_DEV_STR NB_BUILD_ABI \ + NB_BUILD_TYPE NB_STABLE_ABI NB_FREE_THREADED_ABI NAMESPACE_BEGIN(NB_NAMESPACE) NAMESPACE_BEGIN(detail) From 7d9aceed0967e5815a80c8191910fdb17c3b8769 Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Sun, 8 Sep 2024 10:12:43 +0900 Subject: [PATCH 04/17] free-threading: Abstractions around locking API This commit implements RAII scope guards that are thin wrappers around existing Python API to lock Python mutexes and enter critical sections with respect to one or two Python objects. In ordinary (non-free-threaded) Python builds, they don't do anything and can be optimized away. --- docs/api_core.rst | 93 ++++++++++++++++++++++++++++++++++++++ include/nanobind/nb_misc.h | 64 ++++++++++++++++++++++++-- 2 files changed, 152 insertions(+), 5 deletions(-) diff --git a/docs/api_core.rst b/docs/api_core.rst index 81f56a467..81f3a5a30 100644 --- a/docs/api_core.rst +++ b/docs/api_core.rst @@ -2552,10 +2552,103 @@ running it in parallel from multiple Python threads. Release the GIL (**must** be currently held) + In :ref:`free-threaded extensions `, this operation also + temporarily releases all :ref:`argument locks ` held by + the current thread. + .. cpp:function:: ~gil_scoped_release() Reacquire the GIL +Free-threading +-------------- + +Nanobind provides abstractions to implement *additional* locking that is +needed to ensure the correctness of free-threaded Python extensions. + +.. cpp:struct:: ft_mutex + + Object-oriented wrapper representing a `PyMutex + `__. It can be + slightly more efficient than OS/language-provided primitives (e.g., + ``std::thread``, ``pthread_mutex_t``) and should generally be preferred when + adding critical sections to Python bindings. + + In Python builds *without* free-threading, this class does nothing. It has + no attributes and the :cpp:func:`lock` and :cpp:func:`unlock` functions + return immediately. + + .. cpp:function:: ft_mutex() + + Create a new (unlocked) mutex. + + .. cpp:function:: void lock() + + Acquire the mutex. + + .. cpp:function:: void unlock() + + Release the mutex. + +.. cpp:struct:: ft_lock_guard + + This class provides a RAII lock guard analogous to ``std::lock_guard`` and + ``std::unique_lock``. + + .. cpp:function:: ft_lock_guard(ft_mutex &mutex) + + Call :cpp:func:`mutex.lock() ` (no-op in non-free-threaded builds). + + .. cpp:function:: ~ft_lock_guard() + + Call :cpp:func:`mutex.unlock() ` (no-op in non-free-threaded builds). + +.. cpp:struct:: ft_object_guard + + This class provides a RAII guard that locks a single Python object within a + local scope (in contrast to :cpp:class:`ft_lock_guard`, which locks a + mutex). + + It is a thin wrapper around the Python `critical section API + `__. + Please refer to the Python documentation for details on the semantics of + this relaxed form of critical section (in particular, Python critical sections + may release previously held locks). + + In Python builds *without* free-threading, this class does nothing---the + constructor and destructor return immediately. + + .. cpp:function:: ft_object_guard(handle h) + + Lock the object ``h`` (no-op in non-free-threaded builds) + + .. cpp:function:: ~ft_object_guard() + + Unlock the object ``h`` (no-op in non-free-threaded builds) + +.. cpp:struct:: ft_object2_guard + + This class provides a RAII guard that locks *two* Python object within a + local scope (in contrast to :cpp:class:`ft_lock_guard`, which locks a + mutex). + + It is a thin wrapper around the Python `critical section API + `__. + Please refer to the Python documentation for details on the semantics of + this relaxed form of critical section (in particular, Python critical sections + may release previously held locks). + + In Python builds *without* free-threading, this class does nothing---the + constructor and destructor return immediately. + + .. cpp:function:: ft_object2_guard(handle h1, handle h2) + + Lock the objects ``h1`` and ``h2`` (no-op in non-free-threaded builds) + + .. cpp:function:: ~ft_object2_guard() + + Unlock the objects ``h1`` and ``h2`` (no-op in non-free-threaded builds) + Low-level type and instance access ---------------------------------- diff --git a/include/nanobind/nb_misc.h b/include/nanobind/nb_misc.h index bd167d0be..50028c09d 100644 --- a/include/nanobind/nb_misc.h +++ b/include/nanobind/nb_misc.h @@ -9,28 +9,82 @@ NAMESPACE_BEGIN(NB_NAMESPACE) +#define NB_NONCOPYABLE(X) \ + X(const X &) = delete; \ + X &operator=(const X &) = delete; + struct gil_scoped_acquire { public: + NB_NONCOPYABLE(gil_scoped_acquire) gil_scoped_acquire() noexcept : state(PyGILState_Ensure()) { } ~gil_scoped_acquire() { PyGILState_Release(state); } - gil_scoped_acquire(const gil_scoped_acquire &) = delete; - gil_scoped_acquire& operator=(const gil_scoped_acquire &) = delete; private: const PyGILState_STATE state; }; -class gil_scoped_release { +struct gil_scoped_release { public: + NB_NONCOPYABLE(gil_scoped_release) gil_scoped_release() noexcept : state(PyEval_SaveThread()) { } ~gil_scoped_release() { PyEval_RestoreThread(state); } - gil_scoped_release(const gil_scoped_release &) = delete; - gil_scoped_release& operator=(const gil_scoped_release &) = delete; private: PyThreadState *state; }; +struct ft_mutex { +public: + NB_NONCOPYABLE(ft_mutex) + ft_mutex() = default; + +#if !defined(NB_FREE_THREADED) + void lock() { } + void unlock() { } +#else + void lock() { PyMutex_Lock(&mutex); } + void unlock() { PyMutex_Unlock(&mutex); } +private: + PyMutex mutex { 0 }; +#endif +}; + +struct ft_lock_guard { +public: + NB_NONCOPYABLE(ft_lock_guard) + ft_lock_guard(ft_mutex &m) : m(m) { m.lock(); } + ~ft_lock_guard() { m.unlock(); } +private: + ft_mutex &m; +}; + + +struct ft_object_guard { +public: + NB_NONCOPYABLE(ft_object_guard) +#if !defined(NB_FREE_THREADED) + ft_object_guard(handle) { } +#else + ft_object_guard(handle h) { PyCriticalSection_Begin(&cs, h.ptr()); } + ~ft_object_guard() { PyCriticalSection_End(&cs); } +private: + PyCriticalSection cs; +#endif +}; + +struct ft_object2_guard { +public: + NB_NONCOPYABLE(ft_object2_guard) +#if !defined(NB_FREE_THREADED) + ft_object2_guard(handle, handle) { } +#else + ft_object2_guard(handle h1, handle h2) { PyCriticalSection2_Begin(&cs, h1.ptr(), h2.ptr()); } + ~ft_object2_guard() { PyCriticalSection2_End(&cs); } +private: + PyCriticalSection2 cs; +#endif +}; + inline bool leak_warnings() noexcept { return detail::leak_warnings(); } From 510e3e6d6fccb041407d9fb0f5ebded7f6ca68ba Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Sun, 8 Sep 2024 10:16:00 +0900 Subject: [PATCH 05/17] free-threading: GC adaptations The garbage collector in free-threaded Python does not like it when the reference count of an object temporarily increases while traversing the object graph in ``tp_traverse``, and doing so introduces leaks. Unfortunately, example implementations of ``tp_traverse`` in both documentation and test suite fall into this trap and must be adapted. --- docs/typeslots.rst | 21 ++++++++++++++++++++- tests/test_classes.cpp | 2 +- tests/test_stl.cpp | 2 +- tests/test_stl.py | 18 +++++++++++++++--- 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/docs/typeslots.rst b/docs/typeslots.rst index ff00b9c81..34f67df42 100644 --- a/docs/typeslots.rst +++ b/docs/typeslots.rst @@ -122,7 +122,7 @@ reference cycles: // If w->value has an associated CPython object, return it. // If not, value.ptr() will equal NULL, which is also fine. - nb::object value = nb::find(w->value); + nb::handle value = nb::find(w->value); // Inform the Python GC about the instance (if non-NULL) Py_VISIT(value.ptr()); @@ -149,6 +149,25 @@ reference cycles: The type ``visitproc`` and macro ``Py_VISIT()`` are part of the Python C API. +.. note:: + + When targeting free-threaded Python, it is important that the ``tp_traverse`` + callback does not hold additional references to the objects being traversed. + + A previous version of this documentation page suggested the following + + .. code-block:: cpp + + nb::object value = nb::find(w->value); + Py_VISIT(value.ptr()); + + However, these now have to change to + + .. code-block:: cpp + + nb::handle value = nb::find(w->value); + Py_VISIT(value.ptr()); + The expression :cpp:func:`nb::inst_ptr\(self) ` efficiently returns the C++ instance associated with a Python object and is explained in the documentation about nanobind's :cpp:ref:`low level interface `. diff --git a/tests/test_classes.cpp b/tests/test_classes.cpp index 22ff2ac81..a75da0fc5 100644 --- a/tests/test_classes.cpp +++ b/tests/test_classes.cpp @@ -122,7 +122,7 @@ int wrapper_tp_traverse(PyObject *self, visitproc visit, void *arg) { Wrapper *w = nb::inst_ptr(self); // If c->value corresponds to an associated CPython object, return it - nb::object value = nb::find(w->value); + nb::handle value = nb::find(w->value); // Inform the Python GC about it Py_VISIT(value.ptr()); diff --git a/tests/test_stl.cpp b/tests/test_stl.cpp index 9dd97a968..bd75db5ca 100644 --- a/tests/test_stl.cpp +++ b/tests/test_stl.cpp @@ -67,7 +67,7 @@ struct FuncWrapper { int funcwrapper_tp_traverse(PyObject *self, visitproc visit, void *arg) { FuncWrapper *w = nb::inst_ptr(self); - nb::object f = nb::cast(w->f, nb::rv_policy::none); + nb::handle f = nb::cast(w->f, nb::rv_policy::none); Py_VISIT(f.ptr()); #if PY_VERSION_HEX >= 0x03090000 diff --git a/tests/test_stl.py b/tests/test_stl.py index 008835cd8..bb2d4e1ec 100644 --- a/tests/test_stl.py +++ b/tests/test_stl.py @@ -306,6 +306,7 @@ def f(): # cpyext reference cycles are not supported, see https://foss.heptapod.net/pypy/pypy/-/issues/3849 @skip_on_pypy def test32_std_function_gc(): + # Test class -> function -> class cyclic reference class A(t.FuncWrapper): pass @@ -316,18 +317,29 @@ class A(t.FuncWrapper): collect() assert t.FuncWrapper.alive == 0 - # A class -> function -> class cyclic reference - class B(t.FuncWrapper): + # t.FuncWrapper is a C extension type with a custom property 'f', which + # can store Python function objects It implements the tp_traverse + # callback so that reference cycles arising from this function object + # can be detected. + + class Test(t.FuncWrapper): def __init__(self): super().__init__() + # The constructor creates a closure, which references 'self' + # and assigns it to the 'self.f' member. + # This creates a cycle self -> self.f -> self def f(): print(self.f) self.f = f + + # The Test class declared above inherits from 'FuncWrapper'. + # This class keeps track of how many references are alive at + # any point to help track down leak issues. assert t.FuncWrapper.alive == 0 - b = B() + b = Test() assert t.FuncWrapper.alive == 1 del b collect() From e02db2d1f2c882af88d5f5b5675321e75c6c8876 Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Sun, 8 Sep 2024 10:21:32 +0900 Subject: [PATCH 06/17] free-threading: Locking for internal data structures This commit enables free-threaded extension builds on Python 3.13+, which involves the following changes: - nanobind must notify Python that an extension supports free-threading. - All internal data structures must be protected from concurrent modification. The approach taken varies with respect to the specific data structure, and a long comment in ``nb_internals.h`` explains the design decisions all of the changes. In general, the implementation avoids centralized locks as much as possible to improve scalability. - Adopting safe versions of certain operations where needed, e.g. ``PyList_GetItemRef()``. - Switching non-object allocation from ``PyObject_Allo()`` to ``PyMem_Alloc()``. --- include/nanobind/nb_accessor.h | 21 +- include/nanobind/stl/detail/nb_dict.h | 2 + src/common.cpp | 31 +-- src/error.cpp | 9 +- src/implicit.cpp | 16 +- src/nb_enum.cpp | 33 +++- src/nb_func.cpp | 75 ++++--- src/nb_internals.cpp | 120 ++++++++---- src/nb_internals.h | 171 ++++++++++++++-- src/nb_ndarray.cpp | 10 +- src/nb_static_property.cpp | 24 ++- src/nb_type.cpp | 269 +++++++++++++++++--------- src/trampoline.cpp | 24 ++- 13 files changed, 588 insertions(+), 217 deletions(-) diff --git a/include/nanobind/nb_accessor.h b/include/nanobind/nb_accessor.h index f3a04d45c..f690360df 100644 --- a/include/nanobind/nb_accessor.h +++ b/include/nanobind/nb_accessor.h @@ -133,21 +133,30 @@ struct num_item { }; struct num_item_list { - static constexpr bool cache_dec_ref = false; + #if defined(Py_GIL_DISABLED) + static constexpr bool cache_dec_ref = true; + #else + static constexpr bool cache_dec_ref = false; + #endif + using key_type = Py_ssize_t; NB_INLINE static void get(PyObject *obj, Py_ssize_t index, PyObject **cache) { - *cache = NB_LIST_GET_ITEM(obj, index); + #if defined(Py_GIL_DISABLED) + *cache = PyList_GetItemRef(obj, index); + #else + *cache = NB_LIST_GET_ITEM(obj, index); + #endif } NB_INLINE static void set(PyObject *obj, Py_ssize_t index, PyObject *v) { -#if !defined(Py_LIMITED_API) - // Handle differences between PyList_SetItem and PyList_SET_ITEM +#if defined(Py_LIMITED_API) || defined(NB_FREE_THREADED) + Py_INCREF(v); + PyList_SetItem(obj, index, v); +#else PyObject *old = NB_LIST_GET_ITEM(obj, index); -#endif Py_INCREF(v); NB_LIST_SET_ITEM(obj, index, v); -#if !defined(Py_LIMITED_API) Py_DECREF(old); #endif } diff --git a/include/nanobind/stl/detail/nb_dict.h b/include/nanobind/stl/detail/nb_dict.h index eb0e2deaf..24f77ea21 100644 --- a/include/nanobind/stl/detail/nb_dict.h +++ b/include/nanobind/stl/detail/nb_dict.h @@ -32,6 +32,8 @@ template struct dict_caster { return false; } + // 'items' is safe to access without locking and reference counting, it + // is unique to this thread Py_ssize_t size = NB_LIST_GET_SIZE(items); bool success = size >= 0; diff --git a/src/common.cpp b/src/common.cpp index 67db5a830..a44a807c5 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -143,6 +143,11 @@ PyObject *module_new(const char *name, PyModuleDef *def) noexcept { def->m_name = name; def->m_size = -1; PyObject *m = PyModule_Create(def); + + #ifdef NB_FREE_THREADED + PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); + #endif + check(m, "nanobind::detail::module_new(): allocation failed!"); return m; } @@ -669,13 +674,15 @@ PyObject **seq_get(PyObject *seq, size_t *size_out, PyObject **temp_out) noexcep still trigger a segfault if dereferenced. */ if (size == 0) result = (PyObject **) 1; +# if !defined(NB_FREE_THREADED) // Require immutable holder in free-threaded mode } else if (PyList_CheckExact(seq)) { size = (size_t) PyList_GET_SIZE(seq); result = ((PyListObject *) seq)->ob_item; if (size == 0) // ditto result = (PyObject **) 1; +# endif } else if (PySequence_Check(seq)) { - temp = PySequence_Fast(seq, ""); + temp = PySequence_Tuple(seq); if (temp) result = seq_get(temp, &size, temp_out); @@ -689,8 +696,8 @@ PyObject **seq_get(PyObject *seq, size_t *size_out, PyObject **temp_out) noexcep Py_ssize_t size_seq = PySequence_Length(seq); if (size_seq >= 0) { - result = (PyObject **) PyObject_Malloc(sizeof(PyObject *) * - (size_seq + 1)); + result = (PyObject **) PyMem_Malloc(sizeof(PyObject *) * + (size_seq + 1)); if (result) { result[size_seq] = nullptr; @@ -704,7 +711,7 @@ PyObject **seq_get(PyObject *seq, size_t *size_out, PyObject **temp_out) noexcep for (Py_ssize_t j = 0; j < i; ++j) Py_DECREF(result[j]); - PyObject_Free(result); + PyMem_Free(result); result = nullptr; break; } @@ -716,7 +723,7 @@ PyObject **seq_get(PyObject *seq, size_t *size_out, PyObject **temp_out) noexcep PyObject **ptr = (PyObject **) PyCapsule_GetPointer(o, nullptr); for (size_t i = 0; ptr[i] != nullptr; ++i) Py_DECREF(ptr[i]); - PyObject_Free(ptr); + PyMem_Free(ptr); }); if (temp) { @@ -726,7 +733,7 @@ PyObject **seq_get(PyObject *seq, size_t *size_out, PyObject **temp_out) noexcep for (Py_ssize_t i = 0; i < size_seq; ++i) Py_DECREF(result[i]); - PyObject_Free(result); + PyMem_Free(result); result = nullptr; } } @@ -763,14 +770,16 @@ PyObject **seq_get_with_size(PyObject *seq, size_t size, if (size == 0) result = (PyObject **) 1; } +# if !defined(NB_FREE_THREADED) // Require immutable holder in free-threaded mode } else if (PyList_CheckExact(seq)) { if (size == (size_t) PyList_GET_SIZE(seq)) { result = ((PyListObject *) seq)->ob_item; if (size == 0) // ditto result = (PyObject **) 1; } +# endif } else if (PySequence_Check(seq)) { - temp = PySequence_Fast(seq, ""); + temp = PySequence_Tuple(seq); if (temp) result = seq_get_with_size(temp, size, temp_out); @@ -785,7 +794,7 @@ PyObject **seq_get_with_size(PyObject *seq, size_t size, if (size == (size_t) size_seq) { result = - (PyObject **) PyObject_Malloc(sizeof(PyObject *) * (size + 1)); + (PyObject **) PyMem_Malloc(sizeof(PyObject *) * (size + 1)); if (result) { result[size] = nullptr; @@ -799,7 +808,7 @@ PyObject **seq_get_with_size(PyObject *seq, size_t size, for (Py_ssize_t j = 0; j < i; ++j) Py_DECREF(result[j]); - PyObject_Free(result); + PyMem_Free(result); result = nullptr; break; } @@ -811,7 +820,7 @@ PyObject **seq_get_with_size(PyObject *seq, size_t size, PyObject **ptr = (PyObject **) PyCapsule_GetPointer(o, nullptr); for (size_t i = 0; ptr[i] != nullptr; ++i) Py_DECREF(ptr[i]); - PyObject_Free(ptr); + PyMem_Free(ptr); }); if (!temp) { @@ -819,7 +828,7 @@ PyObject **seq_get_with_size(PyObject *seq, size_t size, for (Py_ssize_t i = 0; i < size_seq; ++i) Py_DECREF(result[i]); - PyObject_Free(result); + PyMem_Free(result); result = nullptr; } } diff --git a/src/error.cpp b/src/error.cpp index 1a36fe853..5d1d6666e 100644 --- a/src/error.cpp +++ b/src/error.cpp @@ -15,6 +15,7 @@ NAMESPACE_BEGIN(NB_NAMESPACE) NAMESPACE_BEGIN(detail) +// Protected by internals->mutex in free-threaded builds Buffer buf(128); NAMESPACE_END(detail) @@ -116,13 +117,15 @@ void python_error::restore() noexcept { #endif const char *python_error::what() const noexcept { - using detail::buf; + using namespace nanobind::detail; // Return the existing error message if already computed once if (m_what) return m_what; gil_scoped_acquire acq; + // 'buf' is protected by internals->mutex in free-threaded builds + lock_internals guard(internals); // Try again with GIL held if (m_what) @@ -147,7 +150,7 @@ const char *python_error::what() const noexcept { #if defined(Py_LIMITED_API) || defined(PYPY_VERSION) object mod = module_::import_("traceback"), result = mod.attr("format_exception")(exc_type, exc_value, exc_traceback); - m_what = detail::strdup_check(borrow(str("\n").attr("join")(result)).c_str()); + m_what = strdup_check(borrow(str("\n").attr("join")(result)).c_str()); #else buf.clear(); if (exc_traceback.is_valid()) { @@ -160,7 +163,7 @@ const char *python_error::what() const noexcept { PyFrameObject *frame = to->tb_frame; Py_XINCREF(frame); - std::vector> frames; + std::vector> frames; while (frame) { frames.push_back(frame); diff --git a/src/implicit.cpp b/src/implicit.cpp index 721ddbc80..10702a06a 100644 --- a/src/implicit.cpp +++ b/src/implicit.cpp @@ -15,10 +15,12 @@ NAMESPACE_BEGIN(detail) void implicitly_convertible(const std::type_info *src, const std::type_info *dst) noexcept { - type_data *t = nb_type_c2p(internals, dst); + nb_internals *internals_ = internals; + type_data *t = nb_type_c2p(internals_, dst); check(t, "nanobind::detail::implicitly_convertible(src=%s, dst=%s): " "destination type unknown!", type_name(src), type_name(dst)); + lock_internals guard(internals_); size_t size = 0; if (t->flags & (uint32_t) type_flags::has_implicit_conversions) { @@ -30,23 +32,25 @@ void implicitly_convertible(const std::type_info *src, t->flags |= (uint32_t) type_flags::has_implicit_conversions; } - void **data = (void **) malloc(sizeof(void *) * (size + 2)); + void **data = (void **) PyMem_Malloc(sizeof(void *) * (size + 2)); if (size) memcpy(data, t->implicit.cpp, size * sizeof(void *)); data[size] = (void *) src; data[size + 1] = nullptr; - free(t->implicit.cpp); + PyMem_Free(t->implicit.cpp); t->implicit.cpp = (decltype(t->implicit.cpp)) data; } void implicitly_convertible(bool (*predicate)(PyTypeObject *, PyObject *, cleanup_list *), const std::type_info *dst) noexcept { - type_data *t = nb_type_c2p(internals, dst); + nb_internals *internals_ = internals; + type_data *t = nb_type_c2p(internals_, dst); check(t, "nanobind::detail::implicitly_convertible(src=, dst=%s): " "destination type unknown!", type_name(dst)); + lock_internals guard(internals_); size_t size = 0; if (t->flags & (uint32_t) type_flags::has_implicit_conversions) { @@ -58,12 +62,12 @@ void implicitly_convertible(bool (*predicate)(PyTypeObject *, PyObject *, t->flags |= (uint32_t) type_flags::has_implicit_conversions; } - void **data = (void **) malloc(sizeof(void *) * (size + 2)); + void **data = (void **) PyMem_Malloc(sizeof(void *) * (size + 2)); if (size) memcpy(data, t->implicit.py, size * sizeof(void *)); data[size] = (void *) predicate; data[size + 1] = nullptr; - free(t->implicit.py); + PyMem_Free(t->implicit.py); t->implicit.py = (decltype(t->implicit.py)) data; } diff --git a/src/nb_enum.cpp b/src/nb_enum.cpp index 47c344b8d..87ef899c8 100644 --- a/src/nb_enum.cpp +++ b/src/nb_enum.cpp @@ -17,12 +17,21 @@ using enum_map = tsl::robin_map; PyObject *enum_create(enum_init_data *ed) noexcept { // Update hash table that maps from std::type_info to Python type - auto [it, success] = internals->type_c2p_slow.try_emplace(ed->type, nullptr); - if (!success) { - PyErr_WarnFormat(PyExc_RuntimeWarning, 1, "nanobind: type '%s' was already registered!\n", ed->name); - PyObject *tp = (PyObject *) it->second->type_py; - Py_INCREF(tp); - return tp; + nb_internals *internals_ = internals; + bool success; + nb_type_map_slow::iterator it; + + { + lock_internals guard(internals_); + std::tie(it, success) = internals->type_c2p_slow.try_emplace(ed->type, nullptr); + if (!success) { + PyErr_WarnFormat(PyExc_RuntimeWarning, 1, + "nanobind: type '%s' was already registered!\n", + ed->name); + PyObject *tp = (PyObject *) it->second->type_py; + Py_INCREF(tp); + return tp; + } } handle scope(ed->scope); @@ -77,15 +86,21 @@ PyObject *enum_create(enum_init_data *ed) noexcept { it.value() = t; - internals->type_c2p_fast[ed->type] = t; - internals->type_c2p_slow[ed->type] = t; + { + lock_internals guard(internals_); + internals_->type_c2p_slow[ed->type] = t; + + #if !defined(NB_FREE_THREADED) + internals_->type_c2p_fast[ed->type] = t; + #endif + } result.attr("__nb_enum__") = capsule(t, [](void *p) noexcept { type_init_data *t = (type_init_data *) p; delete (enum_map *) t->enum_tbl.fwd; delete (enum_map *) t->enum_tbl.rev; nb_type_unregister(t); - free((char*)t->name); + free((char*) t->name); delete t; }); diff --git a/src/nb_func.cpp b/src/nb_func.cpp index 0965e6a9b..2571d006c 100644 --- a/src/nb_func.cpp +++ b/src/nb_func.cpp @@ -82,11 +82,15 @@ void nb_func_dealloc(PyObject *self) { func_data *f = nb_func_data(self); // Delete from registered function list - size_t n_deleted = internals->funcs.erase(self); - check(n_deleted == 1, - "nanobind::detail::nb_func_dealloc(\"%s\"): function not found!", - ((f->flags & (uint32_t) func_flags::has_name) ? f->name + nb_internals *internals_ = internals; + { + lock_internals guard(internals_); + size_t n_deleted = internals_->funcs.erase(self); + check(n_deleted == 1, + "nanobind::detail::nb_func_dealloc(\"%s\"): function not found!", + ((f->flags & (uint32_t) func_flags::has_name) ? f->name : "")); + } for (size_t i = 0; i < size; ++i) { if (f->flags & (uint32_t) func_flags::has_free) f->free_capture(f->capture); @@ -219,14 +223,15 @@ PyObject *nb_func_new(const void *in_) noexcept { } // Check for previous overloads + nb_internals *internals_ = internals; if (has_scope && has_name) { name = PyUnicode_InternFromString(name_cstr); check(name, "nb::detail::nb_func_new(\"%s\"): invalid name.", name_cstr); func_prev = PyObject_GetAttr(f->scope, name); if (func_prev) { - if (Py_TYPE(func_prev) == internals->nb_func || - Py_TYPE(func_prev) == internals->nb_method) { + if (Py_TYPE(func_prev) == internals_->nb_func || + Py_TYPE(func_prev) == internals_->nb_method) { func_data *fp = nb_func_data(func_prev); check((fp->flags & (uint32_t) func_flags::is_method) == @@ -279,7 +284,7 @@ PyObject *nb_func_new(const void *in_) noexcept { // Create a new function and destroy the old one Py_ssize_t to_copy = func_prev ? Py_SIZE(func_prev) : 0; nb_func *func = (nb_func *) PyType_GenericAlloc( - is_method ? internals->nb_method : internals->nb_func, to_copy + 1); + is_method ? internals_->nb_method : internals_->nb_func, to_copy + 1); check(func, "nb::detail::nb_func_new(\"%s\"): alloc. failed (1).", name_cstr); @@ -299,7 +304,8 @@ PyObject *nb_func_new(const void *in_) noexcept { ((PyVarObject *) func_prev)->ob_size = 0; - size_t n_deleted = internals->funcs.erase(func_prev); + lock_internals guard(internals_); + size_t n_deleted = internals_->funcs.erase(func_prev); check(n_deleted == 1, "nanobind::detail::nb_func_new(): internal update failed (1)!"); @@ -312,16 +318,20 @@ PyObject *nb_func_new(const void *in_) noexcept { : nb_func_vectorcall_simple; // Register the function - auto [it, success] = internals->funcs.try_emplace(func, nullptr); - check(success, - "nanobind::detail::nb_func_new(): internal update failed (2)!"); + nb_ptr_map::iterator it; + { + lock_internals guard(internals_); + bool success; + std::tie(it, success) = internals_->funcs.try_emplace(func, nullptr); + check(success, + "nanobind::detail::nb_func_new(): internal update failed (2)!"); + } func_data *fc = nb_func_data(func) + to_copy; memcpy(fc, f, sizeof(func_data_prelim<0>)); if (has_doc) { - if (fc->doc[0] == '\n') { + if (fc->doc[0] == '\n') fc->doc++; - } fc->doc = strdup_check(fc->doc); } @@ -427,6 +437,9 @@ nb_func_error_overload(PyObject *self, PyObject *const *args_in, if (f->flags & (uint32_t) func_flags::is_operator) return not_implemented().release().ptr(); + // The buffer 'buf' is protected by 'internals.mutex' + lock_internals guard(internals); + buf.clear(); buf.put_dstr(f->name); buf.put("(): incompatible function arguments. The following argument types " @@ -486,6 +499,10 @@ static NB_NOINLINE PyObject *nb_func_error_noconvert(PyObject *self, if (PyErr_Occurred()) return nullptr; func_data *f = nb_func_data(self); + + // The buffer 'buf' is protected by 'internals.mutex' + lock_internals guard(internals); + buf.clear(); buf.put("Unable to convert function return value to a Python " "type! The signature was\n "); @@ -619,7 +636,7 @@ static PyObject *nb_func_vectorcall_complex(PyObject *self, until we get a result other than NB_NEXT_OVERLOAD. */ - for (int pass = (count > 1) ? 0 : 1; pass < 2; ++pass) { + for (size_t pass = (count > 1) ? 0 : 1; pass < 2; ++pass) { for (size_t k = 0; k < count; ++k) { const func_data *f = fr + k; @@ -961,8 +978,8 @@ PyObject *nb_method_descr_get(PyObject *self, PyObject *inst, PyObject *) { } } - -/// Render the function signature of a single function +/// Render the function signature of a single function. Callers must hold the +/// 'internals' mutex. static uint32_t nb_func_render_signature(const func_data *f, bool nb_signature_mode) noexcept { const bool is_method = f->flags & (uint32_t) func_flags::is_method, @@ -971,6 +988,7 @@ static uint32_t nb_func_render_signature(const func_data *f, has_var_kwargs = f->flags & (uint32_t) func_flags::has_var_kwargs, has_signature = f->flags & (uint32_t) func_flags::has_signature; + nb_internals *internals_ = internals; if (has_signature) { const char *s = f->signature; @@ -1086,7 +1104,7 @@ static uint32_t nb_func_render_signature(const func_data *f, } if (f->args[arg_index].value) { - const arg_data &arg= f->args[arg_index]; + const arg_data &arg = f->args[arg_index]; if (nb_signature_mode) { buf.put(" = \\"); if (arg.signature) @@ -1096,8 +1114,12 @@ static uint32_t nb_func_render_signature(const func_data *f, buf.put(" = "); buf.put_dstr(arg.signature); } else { - PyObject *o = arg.value; - PyObject *str = PyObject_Repr(o); + PyObject *o = arg.value, *str; + + { + unlock_internals guard2(internals_); + str = PyObject_Repr(o); + } if (str) { Py_ssize_t size = 0; @@ -1129,14 +1151,17 @@ static uint32_t nb_func_render_signature(const func_data *f, "nb::detail::nb_func_render_signature(): missing type!"); if (!(is_method && arg_index == 0)) { - auto it = internals->type_c2p_slow.find(*descr_type); + bool found = false; + auto it = internals_->type_c2p_slow.find(*descr_type); - if (it != internals->type_c2p_slow.end()) { + if (it != internals_->type_c2p_slow.end()) { handle th((PyObject *) it->second->type_py); buf.put_dstr((borrow(th.attr("__module__"))).c_str()); buf.put('.'); buf.put_dstr((borrow(th.attr("__qualname__"))).c_str()); - } else { + found = true; + } + if (!found) { if (nb_signature_mode) buf.put('"'); char *name = type_name(*descr_type); @@ -1227,6 +1252,9 @@ PyObject *nb_func_get_nb_signature(PyObject *self, void *) { Py_INCREF(docstr); } + // The buffer 'buf' is protected by 'internals.mutex' + lock_internals guard(internals); + buf.clear(); uint32_t n_default_args = nb_func_render_signature(fi, true); @@ -1285,6 +1313,9 @@ PyObject *nb_func_get_doc(PyObject *self, void *) { func_data *f = nb_func_data(self); uint32_t count = (uint32_t) Py_SIZE(self); + // The buffer 'buf' is protected by 'internals.mutex' + lock_internals guard(internals); + buf.clear(); size_t doc_count = 0; diff --git a/src/nb_internals.cpp b/src/nb_internals.cpp index cb032a56d..68ce3853a 100644 --- a/src/nb_internals.cpp +++ b/src/nb_internals.cpp @@ -10,6 +10,7 @@ #include #include #include "nb_internals.h" +#include #if defined(__GNUC__) && !defined(__clang__) # pragma GCC diagnostic ignored "-Wmissing-field-initializers" @@ -232,6 +233,7 @@ void default_exception_translator(const std::exception_ptr &p, void *) { } } +// Initialized once when the module is loaded, no locking needed nb_internals *internals = nullptr; PyTypeObject *nb_meta_cache = nullptr; @@ -240,62 +242,71 @@ static bool *is_alive_ptr = &is_alive_value; bool is_alive() noexcept { return *is_alive_ptr; } static void internals_cleanup() { - if (!internals) + nb_internals *p = internals; + if (!p) return; *is_alive_ptr = false; -#if !defined(PYPY_VERSION) +#if !defined(PYPY_VERSION) && !defined(NB_FREE_THREADED) /* The memory leak checker is unsupported on PyPy, see - see https://foss.heptapod.net/pypy/pypy/-/issues/3855 */ + see https://foss.heptapod.net/pypy/pypy/-/issues/3855 - bool leak = false, print_leak_warnings = internals->print_leak_warnings; + It is explicitly disabled on free-threaded builds, which + immortalize function and type objects. + */ - if (!internals->inst_c2p.empty()) { - if (print_leak_warnings) { - fprintf(stderr, "nanobind: leaked %zu instances!\n", - internals->inst_c2p.size()); - #if !defined(Py_LIMITED_API) - auto print_leak = [](void* k, PyObject* v) { - PyTypeObject *tp = Py_TYPE(v); - fprintf(stderr, " - leaked instance %p of type \"%s\"\n", k, tp->tp_name); - }; - - for (auto [k, v]: internals->inst_c2p) { - if (NB_UNLIKELY(nb_is_seq(v))) { - nb_inst_seq* seq = nb_get_seq(v); - for(; seq != nullptr; seq = seq->next) - print_leak(k, seq->inst); - } else { - print_leak(k, (PyObject*)v); - } - } - #endif - } - leak = true; + bool print_leak_warnings = p->print_leak_warnings; + size_t inst_leaks = 0, keep_alive_leaks = 0; + + // Shard locking no longer needed, Py_AtExit is single-threaded + for (size_t i = 0; i < p->shard_count; ++i) { + nb_shard &s = p->shards[i]; + inst_leaks += s.inst_c2p.size(); + keep_alive_leaks += s.keep_alive.size(); } - if (!internals->keep_alive.empty()) { - if (print_leak_warnings) { - fprintf(stderr, "nanobind: leaked %zu keep_alive records!\n", - internals->keep_alive.size()); + bool leak = inst_leaks > 0 || keep_alive_leaks > 0; + + if (print_leak_warnings && inst_leaks > 0) { + fprintf(stderr, "nanobind: leaked %zu instances!\n", inst_leaks); + +#if !defined(Py_LIMITED_API) + auto print_leak = [](void* k, PyObject* v) { + type_data *tp = nb_type_data(Py_TYPE(v)); + fprintf(stderr, " - leaked instance %p of type \"%s\"\n", k, tp->name); + }; + + for (size_t i = 0; i < p->shard_count; ++i) { + for (auto [k, v]: p->shards[i].inst_c2p) { + if (NB_UNLIKELY(nb_is_seq(v))) { + nb_inst_seq* seq = nb_get_seq(v); + for(; seq != nullptr; seq = seq->next) + print_leak(k, seq->inst); + } else { + print_leak(k, (PyObject*)v); + } + } } - leak = true; +#endif } + if (print_leak_warnings && keep_alive_leaks > 0) + fprintf(stderr, "nanobind: leaked %zu keep_alive records!\n", + keep_alive_leaks); + // Only report function/type leaks if actual nanobind instances were leaked #if !defined(NB_ABORT_ON_LEAK) if (!leak) print_leak_warnings = false; #endif - if (!internals->type_c2p_slow.empty() || - !internals->type_c2p_fast.empty()) { + if (!p->type_c2p_slow.empty()) { if (print_leak_warnings) { fprintf(stderr, "nanobind: leaked %zu types!\n", - internals->type_c2p_slow.size()); + p->type_c2p_slow.size()); int ctr = 0; - for (const auto &kv : internals->type_c2p_slow) { + for (const auto &kv : p->type_c2p_slow) { fprintf(stderr, " - leaked type \"%s\"\n", kv.second->name); if (ctr++ == 10) { fprintf(stderr, " - ... skipped remainder\n"); @@ -306,12 +317,12 @@ static void internals_cleanup() { leak = true; } - if (!internals->funcs.empty()) { + if (!p->funcs.empty()) { if (print_leak_warnings) { fprintf(stderr, "nanobind: leaked %zu functions!\n", - internals->funcs.size()); + p->funcs.size()); int ctr = 0; - for (auto [f, p] : internals->funcs) { + for (auto [f, p] : p->funcs) { fprintf(stderr, " - leaked function \"%s\"\n", nb_func_data(f)->name); if (ctr++ == 10) { @@ -324,13 +335,19 @@ static void internals_cleanup() { } if (!leak) { - nb_translator_seq* t = internals->translators.next; + nb_translator_seq* t = p->translators.next; while (t) { nb_translator_seq *next = t->next; delete t; t = next; } - delete internals; + +#if defined(NB_FREE_THREADED) + PyThread_tss_delete(p->nb_static_property_disabled); + PyThread_tss_free(p->nb_static_property_disabled); +#endif + + delete p; internals = nullptr; nb_meta_cache = nullptr; } else { @@ -376,6 +393,17 @@ NB_NOINLINE void init(const char *name) { nb_internals *p = new nb_internals(); + size_t shard_count = 1; +#if defined(NB_FREE_THREADED) + size_t hw_concurrency = std::thread::hardware_concurrency(); + while (shard_count < hw_concurrency) + shard_count *= 2; + shard_count *= 2; + p->shards = new nb_shard[shard_count]; + p->shard_mask = shard_count - 1; +#endif + p->shard_count = shard_count; + str nb_name("nanobind"); p->nb_module = PyModule_NewObject(nb_name.ptr()); @@ -385,8 +413,16 @@ NB_NOINLINE void init(const char *name) { p->nb_func = (PyTypeObject *) PyType_FromSpec(&nb_func_spec); p->nb_method = (PyTypeObject *) PyType_FromSpec(&nb_method_spec); p->nb_bound_method = (PyTypeObject *) PyType_FromSpec(&nb_bound_method_spec); - p->keep_alive.min_load_factor(.1f); - p->inst_c2p.min_load_factor(.1f); + +#if defined(NB_FREE_THREADED) + p->nb_static_property_disabled = PyThread_tss_alloc(); + PyThread_tss_create(p->nb_static_property_disabled); +#endif + + for (size_t i = 0; i < shard_count; ++i) { + p->shards[i].keep_alive.min_load_factor(.1f); + p->shards[i].inst_c2p.min_load_factor(.1f); + } check(p->nb_module && p->nb_meta && p->nb_type_dict && p->nb_func && p->nb_method && p->nb_bound_method, diff --git a/src/nb_internals.h b/src/nb_internals.h index 94693e8cb..712ed1028 100644 --- a/src/nb_internals.h +++ b/src/nb_internals.h @@ -202,6 +202,106 @@ struct nb_translator_seq { nb_translator_seq *next = nullptr; }; +#if defined(NB_FREE_THREADED) +# define NB_SHARD_ALIGNMENT alignas(64) +#else +# define NB_SHARD_ALIGNMENT +#endif + +/** + * The following data structure stores information associated with individual + * instances. In free-threaded builds, it is split into multiple shards to avoid + * lock contention. + */ +struct NB_SHARD_ALIGNMENT nb_shard { + /** + * C++ -> Python instance map + * + * This associative data structure maps a C++ instance pointer onto its + * associated PyObject* (if bit 0 of the map value is zero) or a linked + * list of type `nb_inst_seq*` (if bit 0 is set---it must be cleared before + * interpreting the pointer in this case). + * + * The latter case occurs when several distinct Python objects reference + * the same memory address (e.g. a struct and its first member). + */ + nb_ptr_map inst_c2p; + + /// Dictionary storing keep_alive references + nb_ptr_map keep_alive; + +#if defined(NB_FREE_THREADED) + PyMutex mutex { }; +#endif +}; + +/** + * `nb_internals` is the central data structure storing information related to + * function/type bindings and instances. Separate nanobind extensions within the + * same NB_DOMAIN furthermore share `nb_internals` to communicate with each + * other, hence any changes here generally require an ABI version bump. + * + * The GIL protects the elements of this data structure from concurrent + * modification. In free-threaded builds, a combination of locking schemes is + * needed to achieve good performance. + * + * In particular, `inst_c2p` and `type_c2p_fast` are very hot and potentially + * accessed several times while dispatching a single function call. The other + * elements are accessed much less frequently and easier to deal with. + * + * The following list clarifies locking semantics for each member. + * + * - `nb_module`, `nb_meta`, `nb_func`, `nb_method`, `nb_bound_method`, + * `*_Type_tp_*`, `shard_count`, `is_alive_ptr`: these are initialized when + * loading the first nanobind extension within a domain, which happens within + * a critical section. They do not require locking. + * + * - `nb_type_dict`: created when the loading the first nanobind extension + * within a domain. While the dictionary itself is protected by its own + * lock, additional locking is needed to avoid races that create redundant + * entries. The `mutex` member is used for this. + * + * - `nb_static_property` and `nb_static_propert_descr_set`: created only once + * on demand, protected by `mutex`. + * + * - `nb_static_property_disabled`: needed to correctly implement assignments to + * static properties. Free-threaded builds store this flag using TLS to avoid + * concurrent modification. + * + * - `nb_static_property` and `nb_static_propert_descr_set`: created only once + * on demand, protected by `mutex`. + * + * - `nb_ndarray`: created only once on demand, protected by `mutex`. + * + * - `inst_c2p`: stores the C++ instance to Python object mapping. This + * data struture is *hot* and uses a sharded locking scheme to reduce + * lock contention. + * + * - `keep_alive`: stores lifetime dependencies (e.g., from the + * reference_internal return value policy). This data structure is + * potentially hot and shares the sharding scheme of `inst_c2p`. + * + * - `type_c2p_slow`: This is the ground-truth source of the `std::type_info` + * to `type_info *` mapping. Unrelated to free-threading, lookups into this + * data struture are generally costly because they use a string comparison on + * some platforms. Because it is only used as a fallback for 'type_c2p_fast', + * protecting this member via the global `mutex` is sufficient. + * + * - `type_c2p_fast`: this data structure is *hot* and mostly read. It maps + * `std::type_info` to `type_info *` but uses pointer-based comparisons. + * The implementation depends on the Python build. + * + * - `translators`: This is an append-to-front-only singly linked list traversed + * while raising exceptions. The main concern is losing elements during + * concurrent append operations. We assume that this data structure is only + * written during module initialization and don't use locking. + * + * - `funcs`: data structure for function leak tracking. Protected by `mutex`. + * + * - `print_leak_warnings`, `print_implicit_cast_warnings`: simple boolean + * flags. No protection against concurrent conflicting updates. + */ + struct nb_internals { /// Internal nanobind module PyObject *nb_module; @@ -217,34 +317,41 @@ struct nb_internals { /// Property variant for static attributes (created on demand) PyTypeObject *nb_static_property = nullptr; - bool nb_static_property_enabled = true; descrsetfunc nb_static_property_descr_set = nullptr; +#if defined(NB_FREE_THREADED) + Py_tss_t *nb_static_property_disabled = nullptr; +#else + bool nb_static_property_disabled = false; +#endif + /// N-dimensional array wrapper (created on demand) PyTypeObject *nb_ndarray = nullptr; - /** - * C++ -> Python instance map - * - * This associative data structure maps a C++ instance pointer onto its - * associated PyObject* (if bit 0 of the map value is zero) or a linked - * list of type `nb_inst_seq*` (if bit 0 is set---it must be cleared before - * interpreting the pointer in this case). - * - * The latter case occurs when several distinct Python objects reference - * the same memory address (e.g. a struct and its first member). - */ - nb_ptr_map inst_c2p; +#if defined(NB_FREE_THREADED) + nb_shard *shards = nullptr; + size_t shard_mask = 0; + + // Heuristic shard selection (from pybind11 PR #5148 by @colesbury), uses + // high pointer bits to group allocations by individual threads/cores. + inline nb_shard &shard(void *p) { + uintptr_t highbits = ((uintptr_t) p) >> 20; + size_t index = ((size_t) fmix64((uint64_t) highbits)) & shard_mask; + return shards[index]; + } +#else + nb_shard shards[1]; + inline nb_shard &shard(void *) { return shards[0]; } +#endif +#if !defined(NB_FREE_THREADED) /// C++ -> Python type map -- fast version based on std::type_info pointer equality nb_type_map_fast type_c2p_fast; +#endif /// C++ -> Python type map -- slow fallback version based on hashed strings nb_type_map_slow type_c2p_slow; - /// Dictionary storing keep_alive references - nb_ptr_map keep_alive; - /// nb_func/meth instance map for leak reporting (used as set, the value is unused) nb_ptr_map funcs; @@ -270,6 +377,13 @@ struct nb_internals { descrsetfunc PyProperty_Type_tp_descr_set; size_t type_data_offset; #endif + +#if defined(NB_FREE_THREADED) + PyMutex mutex { }; +#endif + + // Size of the 'shards' data structure. Only rarely accessed, hence at the end + size_t shard_count = 1; }; /// Convenience macro to potentially access cached functions @@ -338,6 +452,31 @@ template struct scoped_pymalloc { T *ptr{ nullptr }; }; + +/// RAII lock/unlock guards for free-threaded builds +#if defined(NB_FREE_THREADED) +struct lock_shard { + nb_shard &s; + lock_shard(nb_shard &s) : s(s) { PyMutex_Lock(&s.mutex); } + ~lock_shard() { PyMutex_Unlock(&s.mutex); } +}; +struct lock_internals { + nb_internals *i; + lock_internals(nb_internals *i) : i(i) { PyMutex_Lock(&i->mutex); } + ~lock_internals() { PyMutex_Unlock(&i->mutex); } +}; +struct unlock_internals { + nb_internals *i; + unlock_internals(nb_internals *i) : i(i) { PyMutex_Unlock(&i->mutex); } + ~unlock_internals() { PyMutex_Lock(&i->mutex); } +}; +#else +struct lock_shard { lock_shard(nb_shard &) { } }; +struct lock_internals { lock_internals(nb_internals *) { } }; +struct unlock_internals { unlock_internals(nb_internals *) { } }; +struct lock_obj { lock_obj(PyObject *) { } }; +#endif + extern char *strdup_check(const char *); extern void *malloc_check(size_t size); diff --git a/src/nb_ndarray.cpp b/src/nb_ndarray.cpp index 2816c2cdf..c8db618d2 100644 --- a/src/nb_ndarray.cpp +++ b/src/nb_ndarray.cpp @@ -174,9 +174,15 @@ static PyMethodDef nb_ndarray_members[] = { }; static PyTypeObject *nd_ndarray_tp() noexcept { - PyTypeObject *tp = internals->nb_ndarray; + nb_internals *internals_ = internals; + PyTypeObject *tp = internals_->nb_ndarray; if (NB_UNLIKELY(!tp)) { + lock_internals guard(internals_); + tp = internals_->nb_ndarray; + if (tp) + return tp; + PyType_Slot slots[] = { { Py_tp_dealloc, (void *) nb_ndarray_dealloc }, { Py_tp_methods, (void *) nb_ndarray_members }, @@ -203,7 +209,7 @@ static PyTypeObject *nd_ndarray_tp() noexcept { tp->tp_as_buffer->bf_releasebuffer = nb_ndarray_releasebuffer; #endif - internals->nb_ndarray = tp; + internals_->nb_ndarray = tp; } return tp; diff --git a/src/nb_static_property.cpp b/src/nb_static_property.cpp index e8474c3ac..0c8bc6399 100644 --- a/src/nb_static_property.cpp +++ b/src/nb_static_property.cpp @@ -5,7 +5,16 @@ NAMESPACE_BEGIN(detail) /// `nb_static_property.__get__()`: Always pass the class instead of the instance. static PyObject *nb_static_property_descr_get(PyObject *self, PyObject *, PyObject *cls) { - if (internals->nb_static_property_enabled) { + + // Flag to avoid infinite recursion during static attribute assignment + bool static_property_disabled; +#if defined(NB_FREE_THREADED) + static_property_disabled = (bool) PyThread_tss_get(internals->nb_static_property_disabled); +#else + static_property_disabled = internals->nb_static_property_disabled; +#endif + + if (!static_property_disabled) { return NB_SLOT(PyProperty_Type, tp_descr_get)(self, cls, cls); } else { Py_INCREF(self); @@ -20,9 +29,16 @@ static int nb_static_property_descr_set(PyObject *self, PyObject *obj, PyObject } PyTypeObject *nb_static_property_tp() noexcept { - PyTypeObject *tp = internals->nb_static_property; + nb_internals *internals_ = internals; + PyTypeObject *tp = internals_->nb_static_property; if (NB_UNLIKELY(!tp)) { + lock_internals guard(internals_); + + tp = internals_->nb_static_property; + if (tp) + return tp; + PyMemberDef *members; #if defined(Py_LIMITED_API) @@ -49,8 +65,8 @@ PyTypeObject *nb_static_property_tp() noexcept { tp = (PyTypeObject *) PyType_FromSpec(&spec); check(tp, "nb_static_property type creation failed!"); - internals->nb_static_property = tp; - internals->nb_static_property_descr_set = nb_static_property_descr_set; + internals_->nb_static_property = tp; + internals_->nb_static_property_descr_set = nb_static_property_descr_set; } return tp; diff --git a/src/nb_type.cpp b/src/nb_type.cpp index 0bce5c28e..340cc3826 100644 --- a/src/nb_type.cpp +++ b/src/nb_type.cpp @@ -100,7 +100,9 @@ PyObject *inst_new_int(PyTypeObject *tp, PyObject * /* args */, self->unused = 0; // Update hash table that maps from C++ to Python instance - auto [it, success] = internals->inst_c2p.try_emplace((void *) payload, self); + nb_shard &shard = internals->shard((void *) payload); + lock_shard guard(shard); + auto [it, success] = shard.inst_c2p.try_emplace((void *) payload, self); check(success, "nanobind::detail::inst_new_int(): unexpected collision!"); } @@ -162,8 +164,11 @@ PyObject *inst_new_ext(PyTypeObject *tp, void *value) { self->intrusive = intrusive; self->unused = 0; + nb_shard &shard = internals->shard(value); + lock_shard guard(shard); + // Update hash table that maps from C++ to Python instance - auto [it, success] = internals->inst_c2p.try_emplace(value, self); + auto [it, success] = shard.inst_c2p.try_emplace(value, self); if (NB_UNLIKELY(!success)) { void *entry = it->second; @@ -243,73 +248,81 @@ static void inst_dealloc(PyObject *self) { operator delete(p, std::align_val_t(t->align)); } - if (NB_UNLIKELY(inst->clear_keep_alive)) { - size_t self_hash = ptr_hash()(self); - nb_ptr_map &keep_alive = internals->keep_alive; - nb_ptr_map::iterator it = keep_alive.find(self, self_hash); - check(it != keep_alive.end(), - "nanobind::detail::inst_dealloc(\"%s\"): inconsistent " - "keep_alive information", t->name); - - nb_weakref_seq *s = (nb_weakref_seq *) it->second; - keep_alive.erase_fast(it); + nb_weakref_seq *wr_seq = nullptr; - do { - nb_weakref_seq *c = s; - s = c->next; + { + // Enter critical section of shard + nb_shard &shard = internals->shard(p); + lock_shard guard(shard); - if (c->callback) - c->callback(c->payload); - else - Py_DECREF((PyObject *) c->payload); + if (NB_UNLIKELY(inst->clear_keep_alive)) { + size_t self_hash = ptr_hash()(self); + nb_ptr_map &keep_alive = shard.keep_alive; + nb_ptr_map::iterator it = keep_alive.find(self, self_hash); + check(it != keep_alive.end(), + "nanobind::detail::inst_dealloc(\"%s\"): inconsistent " + "keep_alive information", t->name); - PyObject_Free(c); - } while (s); - } + wr_seq = (nb_weakref_seq *) it->second; + keep_alive.erase_fast(it); + } - size_t p_hash = ptr_hash()(p); + size_t p_hash = ptr_hash()(p); - // Update hash table that maps from C++ to Python instance - nb_ptr_map &inst_c2p = internals->inst_c2p; - nb_ptr_map::iterator it = inst_c2p.find(p, p_hash); - bool found = false; + // Update hash table that maps from C++ to Python instance + nb_ptr_map &inst_c2p = shard.inst_c2p; + nb_ptr_map::iterator it = inst_c2p.find(p, p_hash); + bool found = false; - if (NB_LIKELY(it != inst_c2p.end())) { - void *entry = it->second; - if (NB_LIKELY(entry == inst)) { - found = true; - inst_c2p.erase_fast(it); - } else if (nb_is_seq(entry)) { - // Multiple objects are associated with this address. Find the right one! - nb_inst_seq *seq = nb_get_seq(entry), - *pred = nullptr; - - do { - if ((nb_inst *) seq->inst == inst) { - found = true; - - if (pred) { - pred->next = seq->next; - } else { - if (seq->next) - it.value() = nb_mark_seq(seq->next); - else - inst_c2p.erase_fast(it); + if (NB_LIKELY(it != inst_c2p.end())) { + void *entry = it->second; + if (NB_LIKELY(entry == inst)) { + found = true; + inst_c2p.erase_fast(it); + } else if (nb_is_seq(entry)) { + // Multiple objects are associated with this address. Find the right one! + nb_inst_seq *seq = nb_get_seq(entry), + *pred = nullptr; + + do { + if ((nb_inst *) seq->inst == inst) { + found = true; + + if (pred) { + pred->next = seq->next; + } else { + if (seq->next) + it.value() = nb_mark_seq(seq->next); + else + inst_c2p.erase_fast(it); + } + + PyMem_Free(seq); + break; } - PyMem_Free(seq); - break; - } - - pred = seq; - seq = seq->next; - } while (seq); + pred = seq; + seq = seq->next; + } while (seq); + } } + + check(found, + "nanobind::detail::inst_dealloc(\"%s\"): attempted to delete an " + "unknown instance (%p)!", t->name, p); } - check(found, - "nanobind::detail::inst_dealloc(\"%s\"): attempted to delete an " - "unknown instance (%p)!", t->name, p); + while (wr_seq) { + nb_weakref_seq *c = wr_seq; + wr_seq = c->next; + + if (c->callback) + c->callback(c->payload); + else + Py_DECREF((PyObject *) c->payload); + + PyMem_Free(c); + } if (NB_UNLIKELY(gc)) PyObject_GC_Del(self); @@ -319,24 +332,37 @@ static void inst_dealloc(PyObject *self) { Py_DECREF(tp); } + type_data *nb_type_c2p(nb_internals *internals_, const std::type_info *type) { +#if defined(NB_FREE_THREADED) + thread_local nb_type_map_fast type_c2p_fast; +#else nb_type_map_fast &type_c2p_fast = internals_->type_c2p_fast; - nb_type_map_slow &type_c2p_slow = internals_->type_c2p_slow; +#endif nb_type_map_fast::iterator it_fast = type_c2p_fast.find(type); if (it_fast != type_c2p_fast.end()) return it_fast->second; + lock_internals guard(internals_); + nb_type_map_slow &type_c2p_slow = internals_->type_c2p_slow; nb_type_map_slow::iterator it_slow = type_c2p_slow.find(type); if (it_slow != type_c2p_slow.end()) { type_data *d = it_slow->second; - nb_alias_chain *chain = (nb_alias_chain *) PyMem_Malloc(sizeof(nb_alias_chain)); +#if !defined(NB_FREE_THREADED) + // Maintain a linked list to clean up 'type_c2p_fast' when the type + // expires (see nb_type_unregister). In free-threaded mode, we leak + // these entries until the thread destructs. + nb_alias_chain *chain = + (nb_alias_chain *) PyMem_Malloc(sizeof(nb_alias_chain)); check(chain, "Could not allocate nb_alias_chain entry!"); chain->next = d->alias_chain; chain->value = type; d->alias_chain = chain; +#endif + type_c2p_fast[type] = d; return d; } @@ -345,11 +371,19 @@ type_data *nb_type_c2p(nb_internals *internals_, } void nb_type_unregister(type_data *t) noexcept { - nb_type_map_slow &type_c2p_slow = internals->type_c2p_slow; - nb_type_map_fast &type_c2p_fast = internals->type_c2p_fast; + nb_internals *internals_ = internals; + nb_type_map_slow &type_c2p_slow = internals_->type_c2p_slow; + + lock_internals guard(internals_); + size_t n_del_slow = type_c2p_slow.erase(t->type); - size_t n_del_slow = type_c2p_slow.erase(t->type), - n_del_fast = type_c2p_fast.erase(t->type); +#if defined(NB_FREE_THREADED) + // In free-threaded mode, stale type information remains in the + // 'type_c2p_fast' TLS. It is freed when the thread terminates. + bool fail = n_del_slow != 1; +#else + nb_type_map_fast &type_c2p_fast = internals_->type_c2p_fast; + size_t n_del_fast = type_c2p_fast.erase(t->type); bool fail = n_del_fast != 1 || n_del_slow != 1; if (!fail) { @@ -365,6 +399,7 @@ void nb_type_unregister(type_data *t) noexcept { cur = next; } } +#endif check(!fail, "nanobind::detail::nb_type_unregister(\"%s\"): could not " @@ -378,8 +413,8 @@ static void nb_type_dealloc(PyObject *o) { nb_type_unregister(t); if (t->flags & (uint32_t) type_flags::has_implicit_conversions) { - free(t->implicit.cpp); - free(t->implicit.py); + PyMem_Free(t->implicit.cpp); + PyMem_Free(t->implicit.py); } free((char *) t->name); @@ -442,11 +477,23 @@ static int nb_type_init(PyObject *self, PyObject *args, PyObject *kwds) { } /// Special case to handle 'Class.property = value' assignments -static int nb_type_setattro(PyObject* obj, PyObject* name, PyObject* value) { +int nb_type_setattro(PyObject* obj, PyObject* name, PyObject* value) { nb_internals *int_p = internals; - int_p->nb_static_property_enabled = false; + + // Set a flag to avoid infinite recursion during static attribute assignment +#if defined(NB_FREE_THREADED) + PyThread_tss_set(int_p->nb_static_property_disabled, (void *) 1); +#else + int_p->nb_static_property_disabled = true; +#endif + PyObject *cur = PyObject_GetAttr(obj, name); - int_p->nb_static_property_enabled = true; + +#if defined(NB_FREE_THREADED) + PyThread_tss_set(int_p->nb_static_property_disabled, (void *) 0); +#else + int_p->nb_static_property_disabled = false; +#endif if (cur) { PyTypeObject *tp = int_p->nb_static_property; @@ -711,6 +758,8 @@ static PyObject *nb_type_from_metaclass(PyTypeObject *meta, PyObject *mod, if (doc && !fail) { size_t size = strlen(doc) + 1; + /// This code path is only used for Python 3.12, where + /// PyObject_Malloc is the right allocation routine for tp_doc char *target = (char *) PyObject_Malloc(size); if (!target) { PyErr_NoMemory(); @@ -756,14 +805,23 @@ static PyObject *nb_type_from_metaclass(PyTypeObject *meta, PyObject *mod, #endif } +extern int nb_type_setattro(PyObject* obj, PyObject* name, PyObject* value); + static PyTypeObject *nb_type_tp(size_t supplement) noexcept { - nb_internals *internals_ = internals; object key = steal(PyLong_FromSize_t(supplement)); + nb_internals *internals_ = internals; PyTypeObject *tp = (PyTypeObject *) PyDict_GetItem(internals_->nb_type_dict, key.ptr()); if (NB_UNLIKELY(!tp)) { + // Retry in critical section to avoid races that create the same nb_type + lock_internals guard(internals_); + + tp = (PyTypeObject *) PyDict_GetItem(internals_->nb_type_dict, key.ptr()); + if (tp) + return tp; + #if defined(Py_LIMITED_API) PyMemberDef members[] = { { "__vectorcalloffset__", Py_T_PYSSIZET, 0, Py_READONLY, nullptr }, @@ -773,6 +831,7 @@ static PyTypeObject *nb_type_tp(size_t supplement) noexcept { // Workaround because __vectorcalloffset__ does not support Py_RELATIVE_OFFSET members[0].offset = internals_->type_data_offset + offsetof(type_data, vectorcall); #endif + PyType_Slot slots[] = { { Py_tp_base, &PyType_Type }, { Py_tp_dealloc, (void *) nb_type_dealloc }, @@ -980,14 +1039,23 @@ PyObject *nb_type_new(const type_init_data *t) noexcept { PyObject *mod = nullptr; // Update hash table that maps from std::type_info to Python type - auto [it, success] = internals->type_c2p_slow.try_emplace(t->type, nullptr); - if (!success) { - PyErr_WarnFormat(PyExc_RuntimeWarning, 1, "nanobind: type '%s' was already registered!\n", t_name); - PyObject *tp = (PyObject *) it->second->type_py; - Py_INCREF(tp); - if (has_signature) - free((char *) t_name); - return tp; + nb_type_map_slow::iterator it; + bool success; + nb_internals *internals_ = internals; + + { + lock_internals guard(internals_); + std::tie(it, success) = internals_->type_c2p_slow.try_emplace(t->type, nullptr); + if (!success) { + PyErr_WarnFormat(PyExc_RuntimeWarning, 1, + "nanobind: type '%s' was already registered!\n", + t_name); + PyObject *tp = (PyObject *) it->second->type_py; + Py_INCREF(tp); + if (has_signature) + free((char *) t_name); + return tp; + } } if (t->scope != nullptr) { @@ -1039,8 +1107,9 @@ PyObject *nb_type_new(const type_init_data *t) noexcept { "nanobind::detail::nb_type_new(\"%s\"): base type is not a " "nanobind type!", t_name); } else if (has_base) { - nb_type_map_slow::iterator it2 = internals->type_c2p_slow.find(t->base); - check(it2 != internals->type_c2p_slow.end(), + lock_internals guard(internals_); + nb_type_map_slow::iterator it2 = internals_->type_c2p_slow.find(t->base); + check(it2 != internals_->type_c2p_slow.end(), "nanobind::detail::nb_type_new(\"%s\"): base type \"%s\" not " "known to nanobind!", t_name, type_name(t->base)); base = (PyObject *) it2->second->type_py; @@ -1251,8 +1320,14 @@ PyObject *nb_type_new(const type_init_data *t) noexcept { if (modname.is_valid()) setattr(result, "__module__", modname.ptr()); - internals->type_c2p_fast[t->type] = to; - internals->type_c2p_slow[t->type] = to; + { + lock_internals guard(internals_); + internals_->type_c2p_slow[t->type] = to; + + #if !defined(NB_FREE_THREADED) + internals_->type_c2p_fast[t->type] = to; + #endif + } if (has_signature) { setattr(result, "__nb_signature__", str(t->name)); @@ -1454,9 +1529,13 @@ void keep_alive(PyObject *nurse, PyObject *patient) { return; if (nb_type_check((PyObject *) Py_TYPE(nurse))) { - nb_weakref_seq **pp = - (nb_weakref_seq **) &internals->keep_alive[nurse]; +#if defined(NB_FREE_THREADED) + nb_shard &shard = internals->shard(inst_ptr((nb_inst *) nurse)); +#else + nb_shard &shard = internals->shards[0]; +#endif + nb_weakref_seq **pp = (nb_weakref_seq **) &shard.keep_alive[nurse]; do { nb_weakref_seq *p = *pp; if (!p) @@ -1467,7 +1546,7 @@ void keep_alive(PyObject *nurse, PyObject *patient) { } while (true); nb_weakref_seq *s = - (nb_weakref_seq *) PyObject_Malloc(sizeof(nb_weakref_seq)); + (nb_weakref_seq *) PyMem_Malloc(sizeof(nb_weakref_seq)); check(s, "nanobind::detail::keep_alive(): out of memory!"); s->payload = patient; @@ -1503,9 +1582,15 @@ void keep_alive(PyObject *nurse, void *payload, check(nurse, "nanobind::detail::keep_alive(): 'nurse' is undefined!"); if (nb_type_check((PyObject *) Py_TYPE(nurse))) { +#if defined(NB_FREE_THREADED) + nb_shard &shard = internals->shard(inst_ptr((nb_inst *) nurse)); +#else + nb_shard &shard = internals->shards[0]; +#endif + nb_weakref_seq - **pp = (nb_weakref_seq **) &internals->keep_alive[nurse], - *s = (nb_weakref_seq *) PyObject_Malloc(sizeof(nb_weakref_seq)); + **pp = (nb_weakref_seq **) &shard.keep_alive[nurse], + *s = (nb_weakref_seq *) PyMem_Malloc(sizeof(nb_weakref_seq)); check(s, "nanobind::detail::keep_alive(): out of memory!"); s->payload = payload; @@ -1620,7 +1705,6 @@ PyObject *nb_type_put(const std::type_info *cpp_type, } nb_internals *internals_ = internals; - nb_ptr_map &inst_c2p = internals_->inst_c2p; type_data *td = nullptr; auto lookup_type = [cpp_type, internals_, &td]() -> bool { @@ -1634,7 +1718,11 @@ PyObject *nb_type_put(const std::type_info *cpp_type, }; if (rvp != rv_policy::copy) { + nb_shard &shard = internals_->shard(value); + lock_shard guard(shard); + // Check if the instance is already registered with nanobind + nb_ptr_map &inst_c2p = shard.inst_c2p; nb_ptr_map::iterator it = inst_c2p.find(value); if (it != inst_c2p.end()) { @@ -1694,7 +1782,6 @@ PyObject *nb_type_put_p(const std::type_info *cpp_type, // Check if the instance is already registered with nanobind nb_internals *internals_ = internals; - nb_ptr_map &inst_c2p = internals_->inst_c2p; // Look up the corresponding Python type type_data *td = nullptr, @@ -1715,7 +1802,11 @@ PyObject *nb_type_put_p(const std::type_info *cpp_type, }; if (rvp != rv_policy::copy) { + nb_shard &shard = internals_->shard(value); + lock_shard guard(shard); + // Check if the instance is already registered with nanobind + nb_ptr_map &inst_c2p = shard.inst_c2p; nb_ptr_map::iterator it = inst_c2p.find(value); if (it != inst_c2p.end()) { diff --git a/src/trampoline.cpp b/src/trampoline.cpp index 4a71b04e1..9671efd55 100644 --- a/src/trampoline.cpp +++ b/src/trampoline.cpp @@ -14,8 +14,13 @@ NAMESPACE_BEGIN(NB_NAMESPACE) NAMESPACE_BEGIN(detail) void trampoline_new(void **data, size_t size, void *ptr) noexcept { - // GIL is held when the trampoline constructor runs - nb_ptr_map &inst_c2p = internals->inst_c2p; + // GIL is held when the trampoline constructor runs. Lock the + // associated instance shard in GIL-less Python. + + nb_shard &shard = internals->shard(ptr); + lock_shard lock(shard); + + nb_ptr_map &inst_c2p = shard.inst_c2p; nb_ptr_map::iterator it = inst_c2p.find(ptr); check(it != inst_c2p.end() && (((uintptr_t) it->second) & 1) == 0, "nanobind::detail::trampoline_new(): unique instance not found!"); @@ -36,6 +41,7 @@ static void trampoline_enter_internal(void **data, size_t size, PyGILState_STATE state{ }; const char *error = nullptr; PyObject *key = nullptr, *value = nullptr; + PyObject *self = (PyObject *) data[0]; PyTypeObject *value_tp = nullptr; size_t offset = 0; @@ -51,8 +57,7 @@ static void trampoline_enter_internal(void **data, size_t size, } else { if (pure) { error = "tried to call a pure virtual function"; - state = PyGILState_Ensure(); - goto fail; + break; } else { return; } @@ -62,6 +67,11 @@ static void trampoline_enter_internal(void **data, size_t size, // Nothing found -- retry, now with lock held state = PyGILState_Ensure(); + ft_object_guard guard(self); + + if (error) + goto fail; + for (size_t i = 0; i < size; i++) { void *d_name = data[2*i + 1], *d_value = data[2*i + 2]; @@ -101,7 +111,7 @@ static void trampoline_enter_internal(void **data, size_t size, goto fail; } - value = PyObject_GetAttr((PyObject *) data[0], key); + value = PyObject_GetAttr(self, key); if (!value) { error = "lookup failed"; goto fail; @@ -136,14 +146,14 @@ static void trampoline_enter_internal(void **data, size_t size, } fail: - type_data *td = nb_type_data(Py_TYPE((PyObject *) data[0])); + type_data *td = nb_type_data(Py_TYPE(self)); PyGILState_Release(state); raise("nanobind::detail::get_trampoline('%s::%s()'): %s!", td->name, name, error); } -NB_THREAD_LOCAL ticket *current_ticket = nullptr; +static NB_THREAD_LOCAL ticket *current_ticket = nullptr; void trampoline_enter(void **data, size_t size, const char *name, bool pure, ticket *t) { trampoline_enter_internal(data, size, name, pure, t); From 9cf1e8ebcaa04adda49afdeff2bb88aadb3fffc8 Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Sun, 8 Sep 2024 10:22:12 +0900 Subject: [PATCH 07/17] free-threading: Immortalize type and function objects Global objects that undergo a high rate of reference count changes can become a bottleneck in free-threaded Python extensions, since the associated atomic operation require coordination between processor cores. Function and type objects are a particular concern. This commit immortalizes such objects, which exempts them from free-threading. The downside of this is that they will leak. --- src/common.cpp | 16 ++++++++++++++++ src/nb_func.cpp | 2 ++ src/nb_internals.h | 2 ++ src/nb_type.cpp | 4 ++++ 4 files changed, 24 insertions(+) diff --git a/src/common.cpp b/src/common.cpp index a44a807c5..0253319d0 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -1174,5 +1174,21 @@ bool issubclass(PyObject *a, PyObject *b) { return bool(rv); } +// ======================================================================== + +/// Make an object immortal when targeting free-threaded Python +void maybe_make_immortal(PyObject *op) { +#ifdef NB_FREE_THREADED + // See CPython's Objects/object.c + if (PyObject_IS_GC(op)) + PyObject_GC_UnTrack(op); + op->ob_tid = _Py_UNOWNED_TID; + op->ob_ref_local = _Py_IMMORTAL_REFCNT_LOCAL; + op->ob_ref_shared = 0; +#else + (void) op; +#endif +} + NAMESPACE_END(detail) NAMESPACE_END(NB_NAMESPACE) diff --git a/src/nb_func.cpp b/src/nb_func.cpp index 2571d006c..33ef98a2a 100644 --- a/src/nb_func.cpp +++ b/src/nb_func.cpp @@ -288,6 +288,8 @@ PyObject *nb_func_new(const void *in_) noexcept { check(func, "nb::detail::nb_func_new(\"%s\"): alloc. failed (1).", name_cstr); + maybe_make_immortal((PyObject *) func); + func->max_nargs = f->nargs; func->complex_call = f->nargs_pos < f->nargs || has_args || has_keep_alive; diff --git a/src/nb_internals.h b/src/nb_internals.h index 712ed1028..2a6fc491b 100644 --- a/src/nb_internals.h +++ b/src/nb_internals.h @@ -479,8 +479,10 @@ struct lock_obj { lock_obj(PyObject *) { } }; extern char *strdup_check(const char *); extern void *malloc_check(size_t size); +extern void maybe_make_immortal(PyObject *op); extern char *extract_name(const char *cmd, const char *prefix, const char *s); + NAMESPACE_END(detail) NAMESPACE_END(NB_NAMESPACE) diff --git a/src/nb_type.cpp b/src/nb_type.cpp index 340cc3826..7df7d3e71 100644 --- a/src/nb_type.cpp +++ b/src/nb_type.cpp @@ -869,6 +869,8 @@ static PyTypeObject *nb_type_tp(size_t supplement) noexcept { tp = (PyTypeObject *) nb_type_from_metaclass( internals_->nb_meta, internals_->nb_module, &spec); + maybe_make_immortal((PyObject *) tp); + handle(tp).attr("__module__") = "nanobind"; int rv = 1; @@ -1271,6 +1273,8 @@ PyObject *nb_type_new(const type_init_data *t) noexcept { "failed: %s!", t_name, err.what()); } + maybe_make_immortal(result); + type_data *to = nb_type_data((PyTypeObject *) result); *to = *t; // note: slices off _init parts From e70cf49e871c0c5d6f4e4b6a5b46ff7c34c2bdfe Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Sun, 8 Sep 2024 10:24:20 +0900 Subject: [PATCH 08/17] free-threading: Argument-level locking Adapting C++ to handle parallelism due to free-threaded Python can be tricky, especially when the original code is given as-is. This commit an tentative API to retrofit locking onto existing code by locking the arguments of function calls. --- docs/api_core.rst | 6 ++ docs/functions.rst | 42 ++++++++++ include/nanobind/nb_attr.h | 63 ++++++++++++--- include/nanobind/nb_cast.h | 14 ---- src/nb_func.cpp | 160 +++++++++++++++++++++++++++---------- 5 files changed, 218 insertions(+), 67 deletions(-) diff --git a/docs/api_core.rst b/docs/api_core.rst index 81f3a5a30..15b8e6492 100644 --- a/docs/api_core.rst +++ b/docs/api_core.rst @@ -1638,6 +1638,12 @@ parameter of :cpp:func:`module_::def`, :cpp:func:`class_::def`, explain it in docstrings and stubs (``str(value)``) does not produce acceptable output. + .. cpp:function:: arg &lock(bool value = true) + + Set a flag noting that this argument must be locked when dispatching a + function call in free-threaded Python extensions. It does nothing in + regular GIL-protected extensions. + .. cpp:struct:: is_method Indicate that the bound function is a method. diff --git a/docs/functions.rst b/docs/functions.rst index 8784ff47e..5061a945c 100644 --- a/docs/functions.rst +++ b/docs/functions.rst @@ -545,3 +545,45 @@ The following interactive session shows how to call them from Python. C++ libraries (e.g. GUI libraries, asynchronous networking libraries, etc.). +.. _binding-overheads: + +Minimizing binding overheads +---------------------------- + +The code that dispatches function calls from Python to C++ is in general +:ref:`highly optimized `. When it is important to further reduce +binding overheads to an absolute minimum, consider removing annotations for +:ref:`keyword and default arguments ` along with +other advanced binding annotations. + +In the snippet below, ``f1`` has lower binding overheads compared to ``f2``. + +.. code-block:: cpp + + NB_MODULE(my_ext, m) { + m.def("f1", [](int) { /* no-op */ }); + m.def("f2", [](int) { /* no-op */ }, "arg"_a); + } + +This is because ``f1``: + +- does not accept keyword arguments and does not specify :ref:`default argument + values `. + +- has no :cpp:class:`nb::keep_alive\() ` or + :ref:`argument locking ` annotations. + +- takes no variable-length positional (:cpp:class:`nb::args `) or keyword + (:cpp:class:`nb::kwargs `) arguments. + +- has 8 or fewer arguments. + +If all of the above conditions are satisfied, nanobind switches to a +specialized dispatcher that is optimized to handle a small number of positional +arguments. Otherwise, it uses the default dispatcher that works in any +situation. It is also worth noting that functions with many overloads generally +execute more slowly, since nanobind must first select a suitable one. + +These differences are mainly of interest when a function that does *very +little* is called at a *very high rate*, in which case binding overheads can +become noticeable. diff --git a/include/nanobind/nb_attr.h b/include/nanobind/nb_attr.h index 72c717720..36a2dfb59 100644 --- a/include/nanobind/nb_attr.h +++ b/include/nanobind/nb_attr.h @@ -32,6 +32,11 @@ struct arg { return *this; } + NB_INLINE arg &lock(bool value = true) { + lock_ = value; + return *this; + } + NB_INLINE arg &sig(const char *value) { signature_ = value; return *this; @@ -40,6 +45,7 @@ struct arg { const char *name_, *signature_; uint8_t convert_{ true }; bool none_{ false }; + bool lock_{ false }; }; struct arg_v : arg { @@ -62,6 +68,7 @@ struct is_flag {}; struct is_final {}; struct is_generic {}; struct kw_only {}; +struct lock_self {}; template struct keep_alive {}; template struct supplement {}; @@ -126,16 +133,38 @@ enum class func_flags : uint32_t { /// Does this overload specify a custom function signature (for docstrings, typing) has_signature = (1 << 16), /// Does this function have one or more nb::keep_alive() annotations? - has_keep_alive = (1 << 17) + has_keep_alive = (1 << 17), + /// Free-threaded Python: does the binding lock the 'self' argument + lock_self = (1 << 18) }; +enum cast_flags : uint8_t { + // Enable implicit conversions (code assumes this has value 1, don't reorder..) + convert = (1 << 0), + + // Passed to the 'self' argument in a constructor call (__init__) + construct = (1 << 1), + + // Indicates that the function dispatcher should accept 'None' arguments + accepts_none = (1 << 2), + + // Indicates that a function argument must be locked before dispatching a call + lock = (1 << 3), + + // Indicates that this cast is performed by nb::cast or nb::try_cast. + // This implies that objects added to the cleanup list may be + // released immediately after the caster's final output value is + // obtained, i.e., before it is used. + manual = (1 << 4) +}; + + struct arg_data { const char *name; const char *signature; PyObject *name_py; PyObject *value; - bool convert; - bool none; + uint8_t flag; }; template struct func_data_prelim { @@ -266,27 +295,37 @@ NB_INLINE void func_extra_apply(F &, std::nullptr_t, size_t &) { } template NB_INLINE void func_extra_apply(F &f, const arg &a, size_t &index) { - arg_data &arg = f.args[index++]; + uint8_t flag = 0; + if (a.none_) + flag |= (uint8_t) cast_flags::accepts_none; + if (a.convert_) + flag |= (uint8_t) cast_flags::convert; + if (a.lock_) + flag |= (uint8_t) cast_flags::lock; + + arg_data &arg = f.args[index]; + arg.flag = flag; arg.name = a.name_; arg.signature = a.signature_; arg.value = nullptr; - arg.convert = a.convert_; - arg.none = a.none_; + index++; } template NB_INLINE void func_extra_apply(F &f, const arg_v &a, size_t &index) { - arg_data &arg = f.args[index++]; - arg.name = a.name_; - arg.signature = a.signature_; - arg.value = a.value.ptr(); - arg.convert = a.convert_; - arg.none = a.none_; + arg_data &ad = f.args[index]; + func_extra_apply(f, (const arg &) a, index); + ad.value = a.value.ptr(); } template NB_INLINE void func_extra_apply(F &, kw_only, size_t &) {} +template +NB_INLINE void func_extra_apply(F &f, lock_self, size_t &) { + f.flags |= (uint32_t) func_flags::lock_self; +} + template NB_INLINE void func_extra_apply(F &, call_guard, size_t &) {} diff --git a/include/nanobind/nb_cast.h b/include/nanobind/nb_cast.h index ec73f051a..7001e4ac0 100644 --- a/include/nanobind/nb_cast.h +++ b/include/nanobind/nb_cast.h @@ -32,20 +32,6 @@ NAMESPACE_BEGIN(NB_NAMESPACE) NAMESPACE_BEGIN(detail) -enum cast_flags : uint8_t { - // Enable implicit conversions (impl. assumes this is 1, don't reorder..) - convert = (1 << 0), - - // Passed to the 'self' argument in a constructor call (__init__) - construct = (1 << 1), - - // Indicates that this cast is performed by nb::cast or nb::try_cast. - // This implies that objects added to the cleanup list may be - // released immediately after the caster's final output value is - // obtained, i.e., before it is used. - manual = (1 << 2), -}; - /** * Type casters expose a member 'Cast' which users of a type caster must * query to determine what the caster actually can (and prefers) to produce. diff --git a/src/nb_func.cpp b/src/nb_func.cpp index 33ef98a2a..3cc05cd51 100644 --- a/src/nb_func.cpp +++ b/src/nb_func.cpp @@ -142,8 +142,8 @@ void nb_bound_method_dealloc(PyObject *self) { } static arg_data method_args[2] = { - { "self", nullptr, nullptr, nullptr, false, false }, - { nullptr, nullptr, nullptr, nullptr, false, false } + { "self", nullptr, nullptr, nullptr, 0 }, + { nullptr, nullptr, nullptr, nullptr, 0 } }; static bool set_builtin_exception_status(builtin_exception &e) { @@ -188,6 +188,50 @@ char *strdup_check(const char *s) { return result; } +#if defined(NB_FREE_THREADED) +// Locked function wrapper for free-threaded Python +struct locked_func { + void *capture[3]; + size_t nargs; + PyObject *(*impl)(void *, PyObject **, uint8_t *, rv_policy, + cleanup_list *); + void (*free_capture)(void *); +}; + +static void locked_func_free(void *p) { + locked_func *f = (locked_func *) ((void **) p)[0]; + if (f->free_capture) + f->free_capture(f->capture); + PyMem_Free(f); +} + +static PyObject *locked_func_impl(void *p, PyObject **args, uint8_t *args_flags, + rv_policy policy, cleanup_list *cleanup) { + handle h1, h2; + size_t ctr = 0; + + locked_func *f = (locked_func *) ((void **) p)[0]; + + for (size_t i = 0; i < f->nargs; ++i) { + if (args_flags[i] & (uint8_t) cast_flags::lock) { + if (ctr == 0) + h1 = args[i]; + h2 = args[i]; + ctr++; + } + } + +#if !defined(NDEBUG) + // nb_func_new ensured that at most two arguments are locked, but + // can't hurt to check once more in debug builds + check(ctr == 1 || ctr == 2, "locked_call: expected 1 or 2 locked arguments!"); +#endif + + ft_object2_guard guard(h1, h2); + return f->impl(f->capture, args, args_flags, policy, cleanup); +} +#endif + /** * \brief Wrap a C++ function into a Python function object * @@ -206,6 +250,7 @@ PyObject *nb_func_new(const void *in_) noexcept { is_implicit = f->flags & (uint32_t) func_flags::is_implicit, is_method = f->flags & (uint32_t) func_flags::is_method, return_ref = f->flags & (uint32_t) func_flags::return_ref, + lock_self = f->flags & (uint32_t) func_flags::lock_self, is_constructor = false, is_init = false, is_new = false, @@ -273,7 +318,7 @@ PyObject *nb_func_new(const void *in_) noexcept { if (is_constructor && f->nargs == 2 && f->descr_types[0] && f->descr_types[0] == f->descr_types[1]) { if (has_args) { - f->args[0].convert = false; + f->args[0].flag &= ~(uint8_t) cast_flags::convert; } else { args_in = method_args + 1; has_args = true; @@ -377,6 +422,7 @@ PyObject *nb_func_new(const void *in_) noexcept { } } + size_t lock_count = 0; if (has_args) { fc->args = (arg_data *) malloc_check(sizeof(arg_data) * f->nargs); @@ -393,12 +439,57 @@ PyObject *nb_func_new(const void *in_) noexcept { } else { a.name_py = nullptr; } - a.none |= a.value == Py_None; + if (a.value == Py_None) + a.flag |= (uint8_t) cast_flags::accepts_none; + if (a.flag & (uint8_t) cast_flags::lock) + lock_count++; a.signature = a.signature ? strdup_check(a.signature) : nullptr; Py_XINCREF(a.value); } } + if (lock_self) { + check(is_method && !is_init, + "nb::detail::nb_func_new(\"%s\"): the nb::lock_self annotation only " + "applies to regular methods.", name_cstr); + +#if defined(NB_FREE_THREADED) + // Must potentially allocate dummy 'args' if 'lock_self' is set + if (!has_args) { + fc->args = (arg_data *) malloc_check(sizeof(arg_data) * f->nargs); + memset(fc->args, 0, sizeof(arg_data) * f->nargs); + for (uint32_t i = 1; i < f->nargs; ++i) + fc->args[i].flag &= (uint8_t) cast_flags::convert; + func->vectorcall = nb_func_vectorcall_complex; + fc->flags |= (uint32_t) func_flags::has_args; + has_args = true; + } + + fc->args[0].flag |= (uint8_t) cast_flags::lock; +#endif + + lock_count++; + } + + check(lock_count <= 2, + "nb::detail::nb_func_new(\"%s\"): at most two function arguments can " + "be locked.", name_cstr); + +#if defined(NB_FREE_THREADED) + if (lock_count) { + locked_func *lf = (locked_func *) PyMem_Malloc(sizeof(locked_func)); + check(lf, "nb::detail::nb_func_new(\"%s\"): locked function alloc. failed.", name_cstr); + memcpy(lf->capture, fc->capture, sizeof(func_data::capture)); + lf->nargs = fc->nargs; + lf->impl = fc->impl; + lf->free_capture = (fc->flags & (uint32_t) func_flags::has_free) ? fc->free_capture : nullptr; + fc->impl = locked_func_impl; + fc->free_capture = locked_func_free; + fc->flags |= (uint32_t) func_flags::has_free; + fc->capture[0] = lf; + } +#endif + // Fast path for vector call object construction if (((is_init && is_method) || (is_new && !is_method)) && nb_type_check(f->scope)) { @@ -674,8 +765,8 @@ static PyObject *nb_func_vectorcall_complex(PyObject *self, continue; // skip nb::args parameter, will be handled below PyObject *arg = nullptr; - bool arg_convert = pass == 1, - arg_none = false; + + uint8_t arg_flag = 1; // If i >= nargs_pos, then this is a keyword-only parameter. // (We skipped any *args parameter using the test above, @@ -709,16 +800,15 @@ static PyObject *nb_func_vectorcall_complex(PyObject *self, if (!arg) arg = ad.value; - - arg_convert &= ad.convert; - arg_none = ad.none; + arg_flag = ad.flag; } - if (!arg || (arg == Py_None && !arg_none)) + if (!arg || (arg == Py_None && (arg_flag & cast_flags::accepts_none) == 0)) break; + // Implicit conversion only active in the 2nd pass + args_flags[i] = arg_flag & ~uint8_t(pass == 0); args[i] = arg; - args_flags[i] = arg_convert ? (uint8_t) cast_flags::convert : (uint8_t) 0; } // Skip this overload if any arguments were unavailable @@ -761,33 +851,28 @@ static PyObject *nb_func_vectorcall_complex(PyObject *self, continue; } + if (is_constructor) - args_flags[0] = (uint8_t) cast_flags::construct; + args_flags[0] |= (uint8_t) cast_flags::construct; + + rv_policy policy = (rv_policy) (f->flags & 0b111); try { + result = nullptr; + // Found a suitable overload, let's try calling it result = f->impl((void *) f->capture, args, args_flags, - (rv_policy) (f->flags & 0b111), &cleanup); + policy, &cleanup); - if (NB_UNLIKELY(!result)) { + if (NB_UNLIKELY(!result)) error_handler = nb_func_error_noconvert; - goto done; - } } catch (builtin_exception &e) { - if (set_builtin_exception_status(e)) { - result = nullptr; - goto done; - } else { + if (!set_builtin_exception_status(e)) result = NB_NEXT_OVERLOAD; - } } catch (python_error &e) { e.restore(); - result = nullptr; - goto done; } catch (...) { nb_func_convert_cpp_exception(); - result = nullptr; - goto done; } if (result != NB_NEXT_OVERLOAD) { @@ -851,7 +936,7 @@ static PyObject *nb_func_vectorcall_simple(PyObject *self, goto done; } - for (int pass = (count > 1) ? 0 : 1; pass < 2; ++pass) { + for (size_t pass = (count > 1) ? 0 : 1; pass < 2; ++pass) { for (int i = 0; i < NB_MAXARGS_SIMPLE; ++i) args_flags[i] = (uint8_t) pass; @@ -865,30 +950,22 @@ static PyObject *nb_func_vectorcall_simple(PyObject *self, continue; try { + result = nullptr; + // Found a suitable overload, let's try calling it result = f->impl((void *) f->capture, (PyObject **) args_in, args_flags, (rv_policy) (f->flags & 0b111), &cleanup); - if (NB_UNLIKELY(!result)) { + if (NB_UNLIKELY(!result)) error_handler = nb_func_error_noconvert; - goto done; - } } catch (builtin_exception &e) { - if (set_builtin_exception_status(e)) { - result = nullptr; - goto done; - } else { + if (!set_builtin_exception_status(e)) result = NB_NEXT_OVERLOAD; - } } catch (python_error &e) { e.restore(); - result = nullptr; - goto done; } catch (...) { nb_func_convert_cpp_exception(); - result = nullptr; - goto done; } if (result != NB_NEXT_OVERLOAD) { @@ -1084,8 +1161,9 @@ static uint32_t nb_func_render_signature(const func_data *f, } buf.put(": "); - if (has_args && f->args[arg_index].none) { - #if PY_VERSION_HEX < 0x030A0000 + if (has_args && f->args[arg_index].flag & + (uint8_t) cast_flags::accepts_none) { +#if PY_VERSION_HEX < 0x030A0000 buf.put("typing.Optional["); #else // See below @@ -1097,7 +1175,7 @@ static uint32_t nb_func_render_signature(const func_data *f, case '}': // Default argument if (has_args) { - if (f->args[arg_index].none) { + if (f->args[arg_index].flag & (uint8_t) cast_flags::accepts_none) { #if PY_VERSION_HEX < 0x030A0000 buf.put(']'); #else From 50771a7342a40f717744b045bf93ce84059823bb Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Sun, 8 Sep 2024 10:24:42 +0900 Subject: [PATCH 09/17] free-threading: Documentation This commit documents free-threading in general and in the context of nanobind extensions. --- docs/changelog.rst | 17 ++- docs/free_threaded.rst | 289 +++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 3 files changed, 304 insertions(+), 3 deletions(-) create mode 100644 docs/free_threaded.rst diff --git a/docs/changelog.rst b/docs/changelog.rst index 28d80ad71..5df92acd6 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -18,7 +18,16 @@ below inherit that of the preceding release. Version 2.2.0 (TBA) ------------------- -* nanobind has always used `PEP 590 vector calls +- nanobind can now target `free-threaded Python + `__, which replaces the `Global + Interpreter Lock (GIL) + `__ with a + fine-grained locking scheme (see `PEP 703 + `__) to better leverage multi-core + parallelism. A `separate documation page `__ explains this in + detail. + +- nanobind has always used `PEP 590 vector calls `__ to efficiently dispatch calls to function and method bindings, but it lacked the ability to do so for constructors (e.g., ``MyType(arg1, arg2, ...)``). @@ -46,8 +55,10 @@ Version 2.2.0 (TBA) - **Memory order**: :cpp:class:`c_contig`, :cpp:class:`f_contig`. - **Shape**: :cpp:class:`nb::shape\<3, 4, 5\> `, etc. - - **Device type**: :cpp:class:`nb::device::cpu `, :cpp:class:`nb::device::cuda `, etc. - - **Framework**: :cpp:class:`nb::numpy `, :cpp:class:`nb::pytorch `, etc. + - **Device type**: :cpp:class:`nb::device::cpu `, + :cpp:class:`nb::device::cuda `, etc. + - **Framework**: :cpp:class:`nb::numpy `, + :cpp:class:`nb::pytorch `, etc. - **Data type**: ``uint64_t``, ``std::complex``, etc. Previously, only the **framework** and **data type** annotations were diff --git a/docs/free_threaded.rst b/docs/free_threaded.rst new file mode 100644 index 000000000..356990408 --- /dev/null +++ b/docs/free_threaded.rst @@ -0,0 +1,289 @@ +.. _free-threaded: + +.. cpp:namespace:: nanobind + +Free-threaded Python +==================== + +**Free-threading** is an experimental new Python feature that replaces the +`Global Interpreter Lock (GIL) +`__ with a fine-grained +locking scheme to better leverage multi-core parallelism. The resulting +benefits do not come for free: extensions must explicitly opt-in and generally +require careful modifications to ensure correctness. + +Nanobind can target free-threaded Python since version 2.2.0. This page +explains how to do so and discusses a few caveats. Besides this page, make sure +to review `py-free-threading.github.io `__ +for a more comprehensive discussion of free-threaded Python. `PEP 703 +`__ explains the nitty gritty details. + +Opting in +--------- + +To opt into free-threaded Python, pass the ``FREE_THREADED`` parameter to the +:cmake:command:`nanobind_add_module()` CMake target command. For other build +systems, refer to their respective documentation pages. + +.. code-block:: cmake + + nanobind_add_module( + my_ext # Target name + FREE_THREADED # Opt into free-threading + my_ext.h # Source code files below + my_ext.cpp) + +nanobind ignores the ``FREE_THREADED`` parameter when the registered Python +version does not support free-threading. + +.. note:: + + **Stable ABI**: Note that there currently is no stable ABI for free-threaded + Python, hence the ``STABLE_ABI`` parameter will be ignored in free-threaded + extensions builds. It is valid to combine the ``STABLE_ABI`` and + ``FREE_THREADED`` arguments: the build system will choose between the two + depending on the detected Python version. + + Warning: Loading a non-free threaded extension into a free-threaded Python + build disables free-threading globally. + +If free-threading was requested and is available, the build system will set the +``NB_FREE_THREADED`` preprocessor flag. This can be helpful to specialize +binding code with ``#ifdef`` blocks, e.g.: + +.. code-block:: cpp + + #if !defined(NB_FREE_THREADED) + ... // simple GIL-protected code + #else + ... // more complex thread-aware code + #endif + +Caveats +------- + +Free-threading can violate implicit assumptions made by extension developers +when previously serial operations suddenly run concurrently, producing +undefined behavior (race conditions, crashes, etc.). + +Let's consider a concrete example: the binding code below defines a ``Counter`` +class with an increment operation. + +.. code-block:: cpp + + struct Counter { + int value = 0; + void inc() { value++; } + }; + + nb::class_(m, "Counter") + .def("inc", &Counter::inc) + .def_ro("value", &Counter::value); + + +If multiple threads call the ``inc()`` method of a single ``Counter``, the +final count will generally be incorrect, as the increment operation ``value++`` +does not execute atomically. + +To fix this, we could modify the C++ type so that it protects its ``value`` +member from concurrent modification, for example using an atomic number type +(e.g., ``std::atomic``) or a critical section (e.g., based on +``std::mutex``). + +The race condition in the above example is relatively benign. However, +in more complex projects, combinations of concurrency and unsafe memory +accesses could introduce non-deterministic data corruption and crashes. + +Another common source of problems are *global variables* undergoing concurrent +modification when no longer protected by the GIL. They will likewise require +supplemental locking. The :ref:`next section ` explains a +Python-specific locking primitive that can be used in binding code besides +the solutions mentioned above. + +.. _free-threaded-locks: + +Python locks +------------ + +Nanobind provides convenience functionality encapsulating the mutex +implementation that is part of Python ("``PyMutex``"). It is slightly more +efficient than OS/language-provided synchronization primitives and generally +preferable within Python extensions. + +The class :cpp:class:`ft_mutex` is analogous to ``std::mutex``, and +:cpp:class:`ft_lock_guard` is analogous to ``std::lock_guard``. Note that they +only exist to add *supplemental* critical sections needed in free-threaded +Python, while becoming inactive (no-ops) when targeting regular GIL-protected +Python. + +With these abstractions, the previous ``Counter`` implementation could be +rewritten as: + +.. code-block:: cpp + :emphasize-lines: 3,6 + + struct Counter { + int value = 0; + nb::ft_mutex mutex; + + void inc() { + nb::ft_lock_guard guard(mutex); + value++; + } + }; + +These locks are very compact (``sizeof(nb::ft_mutex) == 1``), though this is a +Python implementation detail that could change in the future. + +.. _argument-locks: + +Argument locking +---------------- + +Modifying class and function definitions as shown above may not always be +possible. As an alternative, nanobind also provides a way to *retrofit* +supplemental locking onto existing code. The idea is to lock individual +arguments of a function *before* being allowed to invoke it. A built-in mutex +present in every Python object enables this. + +To do so, call the :cpp:func:`.lock() ` member of +:cpp:class:`nb::arg() ` annotations to indicate that an +argument must be locked, e.g.: + +- ``nb::arg("my_parameter").lock()`` +- ``"my_parameter"_a.lock()`` (short-hand form) + +In methods bindings, pass :cpp:struct:`nb::lock_self() ` to lock +the implicit ``self`` argument. Note that at most 2 arguments can be +locked per function, which is a limitation of the `Python locking API +`__. + +The example below shows how this functionality can be used to protect ``inc()`` +and a new ``merge()`` function that acquires two simultaneous locks. + +.. code-block:: cpp + + struct Counter { + int value = 0; + + void inc() { value++; } + void merge(Counter &other) { + value += other.value; + other.value = 0; + } + }; + + nb::class_(m, "Counter") + .def("inc", &Counter::inc, nb::lock_self()) + .def("merge", &Counter::merge, nb::lock_self(), "other"_a.lock()) + .def_ro("value", &Counter::value); + +The above solution has an obvious drawback: it only protects *bindings* (i.e., +transitions from Python to C++). For example, if some other part of a C++ +codebase calls ``merge()`` directly, the binding layer won't be involved, and +no locking takes place. If such behavior can introduce race conditions, a +larger-scale redesign of your project may be in order. + +.. note:: + + Adding locking annotations indiscriminately is inadvisable because they add + a runtime cost to function call dispatcher. + The :cpp:func:`.lock() ` annotation is ignored in GIL-protected + builds. Note that listing arguments in function bindings generally comes at + a small cost in terms of :ref:`binding overheads `. + +.. note:: + + **Python API and locking**: When the lock-protected function performs Python + API calls (e.g., using :ref:`wrappers ` like :cpp:class:`nb::dict + `), Python may temporarily release locks to avoid deadlocks. Here, + even basic reference counting such as a :cpp:class:`nb::object + ` variable expiring at the end of a scope counts as an API call. + + These locks will be reacquired following the Python API call. This behavior + resembles ordinary (GIL-protected) Python code, where operations like + `Py_DECREF() + `__ can cause + cause arbitrary Python code to execute. The semantics of this kind of + relaxed critical section are described in the `Python documentation + `__. + +Miscellaneous notes +------------------- + +API +--- + +The following API specific to free-threading has been added: + +- :cpp:class:`nb::ft_mutex ` +- :cpp:class:`nb::ft_lock_guard ` +- :cpp:class:`nb::ft_object_guard ` +- :cpp:class:`nb::ft_object2_guard ` +- :cpp:func:`nb::arg::lock() ` + +API stability +_____________ + +The interface explained in this is excluded from the project's semantic +versioning policy. Free-threading is still experimental, and API breaks may be +necessary based on future experience and changes in Python itself. + +Wrappers +________ + +:ref:`Wrapper types ` like :cpp:class:`nb::list ` may used in +multi-threaded code. Operations like :cpp:func:`nb::list::append() +` internally acquire locks and behave just like their ordinary +Python counterparts. This means that race conditions can still occur without +larger-scale synchronization, but such races won't jeopardize the memory safety +of the program. + +GIL scope guards +________________ + +Prior to free-threaded Python, the nanobind scope guards +:cpp:struct:`gil_scoped_acquire` and :cpp:struct:`gil_scoped_release` would +normally be used to acquire/release the GIL and enable parallel regions. + +These remain useful and should not be removed from existing code: while no +longer blocking operations, they set and unset the current Python thread +context and inform the garbage collector. + +The :cpp:struct:`gil_scoped_release` RAII scope guard class plays a special +role in free-threaded builds, since it releases all :ref:`argument locks +` held by the current thread. + +Immortalization +_______________ + +Python relies on a technique called *reference counting* to determine when an +object is no longer needed. This approach can become a bottleneck in +multi-threaded programs, since increasing and decreasing reference counts +requires coordination among multiple processor cores. Python type and function +objects are especially sensitive, since their reference counts change at a very +high rate. + +Similar to free-threaded Python itself, nanobind avoids this bottleneck by +*immortalizing* functions (``nanobind.nb_func``, ``nanobind.nb_method``) and +type bindings. Immortal objects don't require reference counting. In turn, the +downside is that they leak when the interpreter shuts down. Free-threaded +nanobind extensions disable the internal :ref:`leak checker `, +since it would produce many warning messages caused by immortal objects. + +Internal data structures +________________________ + +Nanobind maintains various internal data structures that store information +about instances and function/type bindings. These data structures also play an +important role to exchange type/instance data in larger projects that are split +across several independent extension modules. + +The layout of these data structures differs between ordinary and free-threaded +extensions, therefore nanobind isolates them from each other by assigning a +different ABI version tag. This means that multi-module projects will need +to consistently compile either free-threaded or non-free-threaded modules. + +Free-threaded nanobind uses thread-local and sharded data structures to avoid +lock and atomic contention on the internal data structures, which would +otherwise become a bottleneck in multi-threaded Python programs. diff --git a/docs/index.rst b/docs/index.rst index e9b53e019..221338280 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -132,6 +132,7 @@ The nanobind logo was designed by `AndoTwin Studio :caption: Advanced :maxdepth: 1 + free_threaded ownership_adv lowlevel typeslots From dd43fe20a4f888c952a54940425ce658c916c5f9 Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Sun, 8 Sep 2024 10:25:21 +0900 Subject: [PATCH 10/17] free-threading: Test suite Several parallel tests to check that locking works as expected --- tests/CMakeLists.txt | 2 + tests/test_thread.cpp | 37 ++++++++++++++++ tests/test_thread.py | 100 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 tests/test_thread.cpp create mode 100644 tests/test_thread.py diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 785db96d2..f306cda7a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -53,6 +53,7 @@ set(TEST_NAMES typing issue intrusive + thread ) foreach (NAME ${TEST_NAMES}) @@ -130,6 +131,7 @@ set(TEST_FILES test_ndarray.py test_stubs.py test_typing.py + test_thread.py # Stub reference files test_classes_ext.pyi.ref diff --git a/tests/test_thread.cpp b/tests/test_thread.cpp new file mode 100644 index 000000000..4ac012943 --- /dev/null +++ b/tests/test_thread.cpp @@ -0,0 +1,37 @@ +#include + +namespace nb = nanobind; +using namespace nb::literals; + +struct Counter { + size_t value = 0; + void inc() { value++; } + void merge(Counter &o) { + value += o.value; + o.value = 0; + } +}; + +nb::ft_mutex mutex; + +NB_MODULE(test_thread_ext, m) { + nb::class_(m, "Counter") + .def(nb::init<>()) + .def_ro("value", &Counter::value) + .def("inc_unsafe", &Counter::inc) + .def("inc_safe", &Counter::inc, nb::lock_self()) + .def("merge_unsafe", &Counter::merge) + .def("merge_safe", &Counter::merge, nb::lock_self(), "o"_a.lock()); + + m.def("return_self", [](Counter *c) -> Counter * { return c; }); + + m.def("inc_safe", + [](Counter &c) { c.inc(); }, + "counter"_a.lock()); + + m.def("inc_global", + [](Counter &c) { + nb::ft_lock_guard guard(mutex); + c.inc(); + }, "counter"); +} diff --git a/tests/test_thread.py b/tests/test_thread.py new file mode 100644 index 000000000..f9fd85f16 --- /dev/null +++ b/tests/test_thread.py @@ -0,0 +1,100 @@ +import test_thread_ext as t +from test_thread_ext import Counter + +import threading + +# Helper function to parallelize execution of a function. We intentionally +# don't use the Python threads pools here to have threads shut down / start +# between test cases. +def parallelize(func, n_threads): + barrier = threading.Barrier(n_threads) + result = [None]*n_threads + + def wrapper(i): + barrier.wait() + result[i] = func() + + workers = [] + for i in range(n_threads): + t = threading.Thread(target=wrapper, args=(i,)) + t.start() + workers.append(t) + + for worker in workers: + worker.join() + return result + + +def test01_object_creation(n_threads=8): + # This test hammers 'inst_c2p' from multiple threads, and + # checks that the locking of internal data structures works + + n = 100000 + def f(): + r = [None]*n + for i in range(n): + c = Counter() + c.inc_unsafe() + r[i] = c + for i in range(n): + assert t.return_self(r[i]) is r[i] + return r + + v = parallelize(f, n_threads=n_threads) + assert len(v) == n_threads + for v2 in v: + assert len(v2) == n + for v3 in v2: + assert v3.value == 1 + +def test02_global_lock(n_threads=8): + # Test that a global PyMutex protects the counter + n = 100000 + c = Counter() + def f(): + for i in range(n): + t.inc_global(c) + + parallelize(f, n_threads=n_threads) + assert c.value == n * n_threads + + +def test03_locked_method(n_threads=8): + # Checks that nb::lock_self() protects an internal counter + n = 100000 + c = Counter() + def f(): + for i in range(n): + c.inc_safe() + + parallelize(f, n_threads=n_threads) + assert c.value == n * n_threads + + +def test04_locked_function(n_threads=8): + # Checks that nb::lock_self() protects an internal counter + n = 100000 + c = Counter() + def f(): + for i in range(n): + t.inc_safe(c) + + parallelize(f, n_threads=n_threads) + assert c.value == n * n_threads + + +def test05_locked_twoargs(n_threads=8): + # Check two-argument locking + n = 100000 + c = Counter() + def f(): + c2 = Counter() + for i in range(n): + c2.inc_unsafe() + if i & 1 == 0: + c2.merge_safe(c) + else: + c.merge_safe(c2) + + parallelize(f, n_threads=n_threads) + assert c.value == n * n_threads From 302ab7ca67db45efd2dcf19ae03ce5f176c76a4d Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Fri, 13 Sep 2024 10:35:28 +0900 Subject: [PATCH 11/17] minor fixes, added entry in why.rst --- docs/free_threaded.rst | 9 ++++++--- docs/why.rst | 7 +++++++ src/nb_enum.cpp | 2 +- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/free_threaded.rst b/docs/free_threaded.rst index 356990408..7acbb42c0 100644 --- a/docs/free_threaded.rst +++ b/docs/free_threaded.rst @@ -44,8 +44,11 @@ version does not support free-threading. ``FREE_THREADED`` arguments: the build system will choose between the two depending on the detected Python version. - Warning: Loading a non-free threaded extension into a free-threaded Python - build disables free-threading globally. +.. warning:: + + Loading an Python extension that does not support free-threading disables + free-threading globally. In larger binding projects with multiple + extensions, all of them must be adapted. If free-threading was requested and is available, the build system will set the ``NB_FREE_THREADED`` preprocessor flag. This can be helpful to specialize @@ -232,7 +235,7 @@ necessary based on future experience and changes in Python itself. Wrappers ________ -:ref:`Wrapper types ` like :cpp:class:`nb::list ` may used in +:ref:`Wrapper types ` like :cpp:class:`nb::list ` may be used in multi-threaded code. Operations like :cpp:func:`nb::list::append() ` internally acquire locks and behave just like their ordinary Python counterparts. This means that race conditions can still occur without diff --git a/docs/why.rst b/docs/why.rst index e7819184b..f20104e77 100644 --- a/docs/why.rst +++ b/docs/why.rst @@ -91,6 +91,13 @@ performance improvements: bottleneck. With nanobind's split into a precompiled library and minimal metatemplating, LTO is no longer crucial and can be skipped. +- **Free-threading**: Python 3.13+ supports a free-threaded mode that removes + the *Global Interpreter Lock* (GIL). Both pybind11 and nanobind support + free-threading as of recently. When comparing the two, nanobind provides + better multi-core scaling using a localized locking scheme. In pybind11, lock + contention on a central ``internals`` data structure used in every binding + operation becomes a bottleneck in practice. + - **Lifetime management**: nanobind maintains efficient internal data structures for lifetime management (needed for :cpp:class:`nb::keep_alive `, :cpp:enumerator:`nb::rv_policy::reference_internal diff --git a/src/nb_enum.cpp b/src/nb_enum.cpp index 87ef899c8..4d4c11613 100644 --- a/src/nb_enum.cpp +++ b/src/nb_enum.cpp @@ -23,7 +23,7 @@ PyObject *enum_create(enum_init_data *ed) noexcept { { lock_internals guard(internals_); - std::tie(it, success) = internals->type_c2p_slow.try_emplace(ed->type, nullptr); + std::tie(it, success) = internals_->type_c2p_slow.try_emplace(ed->type, nullptr); if (!success) { PyErr_WarnFormat(PyExc_RuntimeWarning, 1, "nanobind: type '%s' was already registered!\n", From 13682cb796422e1dc915519577e661be928b8227 Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Mon, 16 Sep 2024 07:33:24 +0900 Subject: [PATCH 12/17] incorporate review feedback --- cmake/darwin-ld-cpython.sym | 6 ++++++ include/nanobind/nb_defs.h | 4 ++++ include/nanobind/nb_lib.h | 2 ++ src/common.cpp | 15 +++++++++++++++ src/nb_enum.cpp | 2 ++ src/nb_func.cpp | 37 +++++++++++++++++-------------------- src/nb_internals.cpp | 20 ++++++++++++++------ src/nb_internals.h | 6 +++++- src/nb_type.cpp | 26 +++++++++++++++++++++----- 9 files changed, 86 insertions(+), 32 deletions(-) diff --git a/cmake/darwin-ld-cpython.sym b/cmake/darwin-ld-cpython.sym index cdd1cbe8a..6cd8c2ae4 100644 --- a/cmake/darwin-ld-cpython.sym +++ b/cmake/darwin-ld-cpython.sym @@ -908,6 +908,12 @@ -U __Py_INCREF_IncRefTotal -U __PyObject_GetDictPtr -U _PyList_GetItemRef +-U _PyDict_GetItemRef +-U _PyDict_GetItemStringRef +-U _PyDict_SetDefault +-U _PyDict_SetDefaultRef +-U _PyWeakref_GetRef +-U _PyImport_AddModuleRef -U _PyUnstable_Module_SetGIL -U _PyMutex_Unlock -U _PyMutex_Lock diff --git a/include/nanobind/nb_defs.h b/include/nanobind/nb_defs.h index c49b7bb0b..d6395e696 100644 --- a/include/nanobind/nb_defs.h +++ b/include/nanobind/nb_defs.h @@ -147,6 +147,10 @@ # error "nanobind requires a newer PyPy version (>= 7.3.10)" #endif +#if defined(NB_FREE_THREADED) && !defined(Py_GIL_DISABLED) +# error "Free-threaded extensions require a free-threaded version of Python" +#endif + #if defined(NB_DOMAIN) # define NB_DOMAIN_STR NB_TOSTRING(NB_DOMAIN) #else diff --git a/include/nanobind/nb_lib.h b/include/nanobind/nb_lib.h index 358ebf837..3541bce8d 100644 --- a/include/nanobind/nb_lib.h +++ b/include/nanobind/nb_lib.h @@ -555,6 +555,8 @@ NB_CORE bool is_alive() noexcept; NB_CORE void *type_get_slot(PyTypeObject *t, int slot_id); #endif +NB_CORE PyObject *dict_get_item_ref_or_fail(PyObject *d, PyObject *k); + NAMESPACE_END(detail) using detail::raise; diff --git a/src/common.cpp b/src/common.cpp index 0253319d0..060be1511 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -1190,5 +1190,20 @@ void maybe_make_immortal(PyObject *op) { #endif } +// ======================================================================== + +PyObject *dict_get_item_ref_or_fail(PyObject *d, PyObject *k) { + PyObject *value; + bool error; +#if PY_VERSION_HEX < 0x030D0000 + value = PyDict_GetItemWithError(d, k); + error = !value && PyErr_Occurred(); +#else + error = PyDict_GetItemRef(d, k, &value) == -1; +#endif + check(!error, "nanobind::detail::dict_get_item_ref_or_fail(): dictionary lookup failed!"); + return value; +} + NAMESPACE_END(detail) NAMESPACE_END(NB_NAMESPACE) diff --git a/src/nb_enum.cpp b/src/nb_enum.cpp index 4d4c11613..425008c8a 100644 --- a/src/nb_enum.cpp +++ b/src/nb_enum.cpp @@ -95,6 +95,8 @@ PyObject *enum_create(enum_init_data *ed) noexcept { #endif } + maybe_make_immortal(result.ptr()); + result.attr("__nb_enum__") = capsule(t, [](void *p) noexcept { type_init_data *t = (type_init_data *) p; delete (enum_map *) t->enum_tbl.fwd; diff --git a/src/nb_func.cpp b/src/nb_func.cpp index 3cc05cd51..56782ab52 100644 --- a/src/nb_func.cpp +++ b/src/nb_func.cpp @@ -82,15 +82,14 @@ void nb_func_dealloc(PyObject *self) { func_data *f = nb_func_data(self); // Delete from registered function list - nb_internals *internals_ = internals; - { - lock_internals guard(internals_); - size_t n_deleted = internals_->funcs.erase(self); - check(n_deleted == 1, - "nanobind::detail::nb_func_dealloc(\"%s\"): function not found!", - ((f->flags & (uint32_t) func_flags::has_name) ? f->name - : "")); - } +#if !defined(NB_FREE_THREADED) + size_t n_deleted = internals->funcs.erase(self); + check(n_deleted == 1, + "nanobind::detail::nb_func_dealloc(\"%s\"): function not found!", + ((f->flags & (uint32_t) func_flags::has_name) ? f->name + : "")); +#endif + for (size_t i = 0; i < size; ++i) { if (f->flags & (uint32_t) func_flags::has_free) f->free_capture(f->capture); @@ -351,10 +350,11 @@ PyObject *nb_func_new(const void *in_) noexcept { ((PyVarObject *) func_prev)->ob_size = 0; - lock_internals guard(internals_); +#if !defined(NB_FREE_THREADED) size_t n_deleted = internals_->funcs.erase(func_prev); check(n_deleted == 1, "nanobind::detail::nb_func_new(): internal update failed (1)!"); +#endif Py_CLEAR(func_prev); } @@ -364,15 +364,12 @@ PyObject *nb_func_new(const void *in_) noexcept { func->vectorcall = func->complex_call ? nb_func_vectorcall_complex : nb_func_vectorcall_simple; +#if !defined(NB_FREE_THREADED) // Register the function - nb_ptr_map::iterator it; - { - lock_internals guard(internals_); - bool success; - std::tie(it, success) = internals_->funcs.try_emplace(func, nullptr); - check(success, - "nanobind::detail::nb_func_new(): internal update failed (2)!"); - } + auto [it, success] = internals_->funcs.try_emplace(func, nullptr); + check(success, + "nanobind::detail::nb_func_new(): internal update failed (2)!"); +#endif func_data *fc = nb_func_data(func) + to_copy; memcpy(fc, f, sizeof(func_data_prelim<0>)); @@ -876,7 +873,7 @@ static PyObject *nb_func_vectorcall_complex(PyObject *self, } if (result != NB_NEXT_OVERLOAD) { - if (is_constructor) { + if (is_constructor && result != nullptr) { nb_inst *self_arg_nb = (nb_inst *) self_arg; self_arg_nb->destruct = true; self_arg_nb->state = nb_inst::state_ready; @@ -969,7 +966,7 @@ static PyObject *nb_func_vectorcall_simple(PyObject *self, } if (result != NB_NEXT_OVERLOAD) { - if (is_constructor) { + if (is_constructor && result != nullptr) { nb_inst *self_arg_nb = (nb_inst *) self_arg; self_arg_nb->destruct = true; self_arg_nb->state = nb_inst::state_ready; diff --git a/src/nb_internals.cpp b/src/nb_internals.cpp index 68ce3853a..3604464b3 100644 --- a/src/nb_internals.cpp +++ b/src/nb_internals.cpp @@ -250,13 +250,14 @@ static void internals_cleanup() { #if !defined(PYPY_VERSION) && !defined(NB_FREE_THREADED) /* The memory leak checker is unsupported on PyPy, see - see https://foss.heptapod.net/pypy/pypy/-/issues/3855 + see https://foss.heptapod.net/pypy/pypy/-/issues/3855. - It is explicitly disabled on free-threaded builds, which - immortalize function and type objects. - */ + Leak reporting is explicitly disabled on free-threaded builds + for now because of the decision to immortalize function and + type objects. This may change in the future. */ bool print_leak_warnings = p->print_leak_warnings; + size_t inst_leaks = 0, keep_alive_leaks = 0; // Shard locking no longer needed, Py_AtExit is single-threaded @@ -343,8 +344,12 @@ static void internals_cleanup() { } #if defined(NB_FREE_THREADED) + // This code won't run for now but is kept here for a time when + // immortalization isn't needed anymore. + PyThread_tss_delete(p->nb_static_property_disabled); PyThread_tss_free(p->nb_static_property_disabled); + delete[] p->shards; #endif delete p; @@ -356,7 +361,7 @@ static void internals_cleanup() { "counting issue in the binding code.\n"); } - #if defined(NB_ABORT_ON_LEAK) + #if defined(NB_ABORT_ON_LEAK) && !defined(NB_FREE_THREADED) abort(); // Extra-strict behavior for the CI server #endif } @@ -380,7 +385,9 @@ NB_NOINLINE void init(const char *name) { NB_INTERNALS_ID, name ? name : ""); check(key, "nanobind::detail::init(): could not create dictionary key!"); - PyObject *capsule = PyDict_GetItem(dict, key); + PyObject *capsule; + + capsule = dict_get_item_ref_or_fail(dict, key); if (capsule) { Py_DECREF(key); internals = (nb_internals *) PyCapsule_GetPointer(capsule, "nb_internals"); @@ -388,6 +395,7 @@ NB_NOINLINE void init(const char *name) { "nanobind::detail::internals_fetch(): capsule pointer is NULL!"); nb_meta_cache = internals->nb_meta; is_alive_ptr = internals->is_alive_ptr; + Py_DECREF(capsule); return; } diff --git a/src/nb_internals.h b/src/nb_internals.h index 2a6fc491b..fbe346d01 100644 --- a/src/nb_internals.h +++ b/src/nb_internals.h @@ -296,7 +296,8 @@ struct NB_SHARD_ALIGNMENT nb_shard { * concurrent append operations. We assume that this data structure is only * written during module initialization and don't use locking. * - * - `funcs`: data structure for function leak tracking. Protected by `mutex`. + * - `funcs`: data structure for function leak tracking. Not used in + * free-threaded mode . * * - `print_leak_warnings`, `print_implicit_cast_warnings`: simple boolean * flags. No protection against concurrent conflicting updates. @@ -352,8 +353,11 @@ struct nb_internals { /// C++ -> Python type map -- slow fallback version based on hashed strings nb_type_map_slow type_c2p_slow; +#if !defined(NB_FREE_THREADED) /// nb_func/meth instance map for leak reporting (used as set, the value is unused) + /// In free-threaded mode, functions are immortal and don't require this data structure. nb_ptr_map funcs; +#endif /// Registered C++ -> Python exception translators nb_translator_seq translators; diff --git a/src/nb_type.cpp b/src/nb_type.cpp index 7df7d3e71..9ed762016 100644 --- a/src/nb_type.cpp +++ b/src/nb_type.cpp @@ -379,7 +379,23 @@ void nb_type_unregister(type_data *t) noexcept { #if defined(NB_FREE_THREADED) // In free-threaded mode, stale type information remains in the - // 'type_c2p_fast' TLS. It is freed when the thread terminates. + // 'type_c2p_fast' TLS. This data structure is eventually deallocated + // when the thread terminates. + // + // In principle, this is dangerous because the user could delete a type + // binding from a module at runtime, causing the associated + // Python type object to be freed. If a function then attempts to return + // a value with such a de-registered type, nanobind should raise an + // exception, which requires knowing that the entry in 'type_c2p_fast' + // has become invalid in the meantime. + // + // Right now, this problem is avoided because we immortalize type objects in + // ``nb_type_new()`` and ``enum_create()``. However, we may not always + // want to stick with immortalization, which is just a workaround. + // + // In the future, a global version counter modified with acquire/release + // semantics (see https://github.com/wjakob/nanobind/pull/695#discussion_r1761600010) + // might prove to be a similarly efficient but more general solution. bool fail = n_del_slow != 1; #else nb_type_map_fast &type_c2p_fast = internals_->type_c2p_fast; @@ -812,13 +828,13 @@ static PyTypeObject *nb_type_tp(size_t supplement) noexcept { nb_internals *internals_ = internals; PyTypeObject *tp = - (PyTypeObject *) PyDict_GetItem(internals_->nb_type_dict, key.ptr()); + (PyTypeObject *) dict_get_item_ref_or_fail(internals_->nb_type_dict, key.ptr()); if (NB_UNLIKELY(!tp)) { // Retry in critical section to avoid races that create the same nb_type lock_internals guard(internals_); - tp = (PyTypeObject *) PyDict_GetItem(internals_->nb_type_dict, key.ptr()); + tp = (PyTypeObject *) dict_get_item_ref_or_fail(internals_->nb_type_dict, key.ptr()); if (tp) return tp; @@ -877,8 +893,6 @@ static PyTypeObject *nb_type_tp(size_t supplement) noexcept { if (tp) rv = PyDict_SetItem(internals_->nb_type_dict, key.ptr(), (PyObject *) tp); check(rv == 0, "nb_type type creation failed!"); - - Py_DECREF(tp); } return tp; @@ -1273,6 +1287,8 @@ PyObject *nb_type_new(const type_init_data *t) noexcept { "failed: %s!", t_name, err.what()); } + Py_DECREF(metaclass); + maybe_make_immortal(result); type_data *to = nb_type_data((PyTypeObject *) result); From 0026b834d05fc4fe3729c2963dffe56a65a64870 Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Mon, 16 Sep 2024 08:41:14 +0900 Subject: [PATCH 13/17] Make ``nb::dict`` iterator non-copyable This commit changes the ``nb::dict`` iterator so that nanobind can implement the recommendation from https://docs.python.org/3.14/howto/free-threading-extensions.html#pydict-next The primary goal of ``nb::internal::dict_iterator`` was to be able to write ```cpp nb::dict my_dict = /* ... */; for (auto [k, v] : my_dict) { // .... } ``` This in fact the only associated feature that is explicitly mentioned in the documentation, and this continues to work. However, some undocumented features are lost: - The dictionary iterator is no longer copyable. This is because it must acquire an exclusive lock to the underlying dictionary. - The pre-increment operator ``++dict_it`` (which relied on copying) is gone. Post-increment continues to work, and that is enough for the loop structure mentioned above. --- docs/api_core.rst | 4 ++++ docs/free_threaded.rst | 4 ++-- include/nanobind/nanobind.h | 2 +- include/nanobind/nb_call.h | 2 +- include/nanobind/nb_defs.h | 5 +++++ include/nanobind/nb_misc.h | 4 ---- include/nanobind/nb_types.h | 29 +++++++++++++++++++---------- 7 files changed, 32 insertions(+), 18 deletions(-) diff --git a/docs/api_core.rst b/docs/api_core.rst index 15b8e6492..fcb1a920d 100644 --- a/docs/api_core.rst +++ b/docs/api_core.rst @@ -790,6 +790,10 @@ Wrapper classes Return an item iterator that returns ``std::pair`` key-value pairs analogous to ``iter(dict.items())`` in Python. + In free-threaded Python, the :cpp:class:``detail::dict_iterator`` class + acquires a lock to the underlying dictionary to enable the use of the + efficient but thread-unsafe ``PyDict_Next()`` Python C traversal routine. + .. cpp:function:: detail::dict_iterator end() const Return a sentinel that ends the iteration. diff --git a/docs/free_threaded.rst b/docs/free_threaded.rst index 7acbb42c0..3e86f51ac 100644 --- a/docs/free_threaded.rst +++ b/docs/free_threaded.rst @@ -235,8 +235,8 @@ necessary based on future experience and changes in Python itself. Wrappers ________ -:ref:`Wrapper types ` like :cpp:class:`nb::list ` may be used in -multi-threaded code. Operations like :cpp:func:`nb::list::append() +:ref:`Wrapper types ` like :cpp:class:`nb::list ` may be used +in multi-threaded code. Operations like :cpp:func:`nb::list::append() ` internally acquire locks and behave just like their ordinary Python counterparts. This means that race conditions can still occur without larger-scale synchronization, but such races won't jeopardize the memory safety diff --git a/include/nanobind/nanobind.h b/include/nanobind/nanobind.h index a138ec4f8..71987856c 100644 --- a/include/nanobind/nanobind.h +++ b/include/nanobind/nanobind.h @@ -48,10 +48,10 @@ #include "nb_error.h" #include "nb_attr.h" #include "nb_cast.h" +#include "nb_misc.h" #include "nb_call.h" #include "nb_func.h" #include "nb_class.h" -#include "nb_misc.h" #if defined(_MSC_VER) # pragma warning(pop) diff --git a/include/nanobind/nb_call.h b/include/nanobind/nb_call.h index 25150b0be..b20bba9d8 100644 --- a/include/nanobind/nb_call.h +++ b/include/nanobind/nb_call.h @@ -65,7 +65,7 @@ NB_INLINE void call_init(PyObject **args, PyObject *kwnames, size_t &nargs, } else if constexpr (std::is_same_v) { PyObject *key, *entry; Py_ssize_t pos = 0; - + ft_object_guard guard(value); while (PyDict_Next(value.ptr(), &pos, &key, &entry)) { Py_INCREF(key); Py_INCREF(entry); args[kwargs_offset + nkwargs] = entry; diff --git a/include/nanobind/nb_defs.h b/include/nanobind/nb_defs.h index d6395e696..071e41f9f 100644 --- a/include/nanobind/nb_defs.h +++ b/include/nanobind/nb_defs.h @@ -173,6 +173,11 @@ # define NB_TYPE_GET_SLOT_IMPL 1 #endif +#define NB_NONCOPYABLE(X) \ + X(const X &) = delete; \ + X &operator=(const X &) = delete; + + #define NB_MODULE_IMPL(name) \ extern "C" [[maybe_unused]] NB_EXPORT PyObject *PyInit_##name(); \ extern "C" NB_EXPORT PyObject *PyInit_##name() diff --git a/include/nanobind/nb_misc.h b/include/nanobind/nb_misc.h index 50028c09d..b29d6b693 100644 --- a/include/nanobind/nb_misc.h +++ b/include/nanobind/nb_misc.h @@ -9,10 +9,6 @@ NAMESPACE_BEGIN(NB_NAMESPACE) -#define NB_NONCOPYABLE(X) \ - X(const X &) = delete; \ - X &operator=(const X &) = delete; - struct gil_scoped_acquire { public: NB_NONCOPYABLE(gil_scoped_acquire) diff --git a/include/nanobind/nb_types.h b/include/nanobind/nb_types.h index 68a16a2aa..d26455931 100644 --- a/include/nanobind/nb_types.h +++ b/include/nanobind/nb_types.h @@ -819,24 +819,29 @@ struct fast_iterator { class dict_iterator { public: + NB_NONCOPYABLE(dict_iterator) + using value_type = std::pair; using reference = const value_type; - dict_iterator() : h(), pos(-1) { } - + dict_iterator() = default; dict_iterator(handle h) : h(h), pos(0) { +#if defined(NB_FREE_THREADED) + PyCriticalSection_Begin(&cs, h.ptr()); +#endif increment(); } - dict_iterator& operator++() { - increment(); - return *this; +#if defined(NB_FREE_THREADED) + ~dict_iterator() { + if (h.ptr()) + PyCriticalSection_End(&cs); } +#endif - dict_iterator operator++(int) { - dict_iterator rv = *this; + dict_iterator& operator++() { increment(); - return rv; + return *this; } void increment() { @@ -851,8 +856,12 @@ class dict_iterator { private: handle h; - Py_ssize_t pos; - PyObject *key = nullptr, *value = nullptr; + Py_ssize_t pos = -1; + PyObject *key = nullptr; + PyObject *value = nullptr; +#if defined(NB_FREE_THREADED) + PyCriticalSection cs { }; +#endif }; NB_IMPL_COMP(equal, Py_EQ) From 29d8834fa5bd304a338e63cf4bc6f22145b1a08c Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Mon, 16 Sep 2024 09:05:12 +0900 Subject: [PATCH 14/17] Fix regressions, use new ``PyImport_AddModuleRef()`` API --- .github/workflows/ci.yml | 4 +-- cmake/darwin-ld-cpython.sym | 1 + src/common.cpp | 57 ++++++++++++++++++------------------- src/nb_internals.cpp | 6 ++-- 4 files changed, 32 insertions(+), 36 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c620ff24c..44b8624d2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-latest', 'windows-2022', 'macos-13'] - python: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13.0-rc.1', 'pypy3.9-v7.3.16', 'pypy3.10-v7.3.17'] + python: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13.0-rc.2', 'pypy3.9-v7.3.16', 'pypy3.10-v7.3.17'] name: "Python ${{ matrix.python }} / ${{ matrix.os }}" runs-on: ${{ matrix.os }} @@ -49,7 +49,7 @@ jobs: python -m pip install pytest pytest-github-actions-annotate-failures typing_extensions - name: Install NumPy - if: ${{ !startsWith(matrix.python, 'pypy') && !contains(matrix.python, 'rc.1') }} + if: ${{ !startsWith(matrix.python, 'pypy') && !contains(matrix.python, 'rc.2') }} run: | python -m pip install numpy scipy diff --git a/cmake/darwin-ld-cpython.sym b/cmake/darwin-ld-cpython.sym index 6cd8c2ae4..850dd384b 100644 --- a/cmake/darwin-ld-cpython.sym +++ b/cmake/darwin-ld-cpython.sym @@ -922,3 +922,4 @@ -U _PyCriticalSection_End -U _PyCriticalSection2_Begin -U _PyCriticalSection2_End +-U _PyUnicode_AsUTF8 diff --git a/src/common.cpp b/src/common.cpp index 060be1511..02bc02ad5 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -168,53 +168,46 @@ PyObject *module_import(PyObject *o) { PyObject *module_new_submodule(PyObject *base, const char *name, const char *doc) noexcept { - PyObject *name_py, *res; + const char *base_name, *tmp_str; + Py_ssize_t tmp_size = 0; + object tmp, res; -#if !defined(PYPY_VERSION) - PyObject *base_name = PyModule_GetNameObject(base); + base_name = PyModule_GetName(base); if (!base_name) goto fail; - name_py = PyUnicode_FromFormat("%U.%s", base_name, name); - Py_DECREF(base_name); -#else - const char *base_name = PyModule_GetName(base); - if (!base_name) + tmp = steal(PyUnicode_FromFormat("%s.%s", base_name, name)); + if (!tmp.is_valid()) goto fail; - name_py = PyUnicode_FromFormat("%s.%s", base_name, name); -#endif - if (!name_py) + tmp_str = PyUnicode_AsUTF8AndSize(tmp.ptr(), &tmp_size); + if (!tmp_str) goto fail; -#if !defined(PYPY_VERSION) - res = PyImport_AddModuleObject(name_py); +#if PY_VERSION_HEX < 0x030D00A0 || defined(Py_LIMITED_API) + res = borrow(PyImport_AddModule(tmp_str)); #else - res = PyImport_AddModule(PyUnicode_AsUTF8(name_py)); + res = steal(PyImport_AddModuleRef(tmp_str)); #endif - Py_DECREF(name_py); - if (!res) + + if (!res.is_valid()) goto fail; if (doc) { - PyObject *doc_py = PyUnicode_FromString(doc); - if (!doc_py) + tmp = steal(PyUnicode_FromString(doc)); + if (!tmp.is_valid()) goto fail; - int rv = PyObject_SetAttrString(res, "__doc__", doc_py); - Py_DECREF(doc_py); - if (rv) + if (PyObject_SetAttrString(res.ptr(), "__doc__", tmp.ptr())) goto fail; } - Py_INCREF(res); // extra reference for PyModule_AddObject - - if (PyModule_AddObject(base, name, res)) { // steals on success - Py_DECREF(res); + res.inc_ref(); // For PyModule_AddObject, which steals upon success + if (PyModule_AddObject(base, name, res.ptr())) { + res.dec_ref(); goto fail; } - Py_INCREF(res); // turned borrowed into new reference - return res; + return res.release().ptr(); fail: raise_python_error(); @@ -1194,10 +1187,14 @@ void maybe_make_immortal(PyObject *op) { PyObject *dict_get_item_ref_or_fail(PyObject *d, PyObject *k) { PyObject *value; - bool error; -#if PY_VERSION_HEX < 0x030D0000 + bool error = false; + +#if PY_VERSION_HEX < 0x030D00A1 || defined(Py_LIMITED_API) value = PyDict_GetItemWithError(d, k); - error = !value && PyErr_Occurred(); + if (value) + Py_INCREF(value); + else + error = PyErr_Occurred(); #else error = PyDict_GetItemRef(d, k, &value) == -1; #endif diff --git a/src/nb_internals.cpp b/src/nb_internals.cpp index 3604464b3..0720223b8 100644 --- a/src/nb_internals.cpp +++ b/src/nb_internals.cpp @@ -323,7 +323,7 @@ static void internals_cleanup() { fprintf(stderr, "nanobind: leaked %zu functions!\n", p->funcs.size()); int ctr = 0; - for (auto [f, p] : p->funcs) { + for (auto [f, p2] : p->funcs) { fprintf(stderr, " - leaked function \"%s\"\n", nb_func_data(f)->name); if (ctr++ == 10) { @@ -385,9 +385,7 @@ NB_NOINLINE void init(const char *name) { NB_INTERNALS_ID, name ? name : ""); check(key, "nanobind::detail::init(): could not create dictionary key!"); - PyObject *capsule; - - capsule = dict_get_item_ref_or_fail(dict, key); + PyObject *capsule = dict_get_item_ref_or_fail(dict, key); if (capsule) { Py_DECREF(key); internals = (nb_internals *) PyCapsule_GetPointer(capsule, "nb_internals"); From ec77413709a80415c4e4492b8fdb921d211052a0 Mon Sep 17 00:00:00 2001 From: Joshua Oreman Date: Thu, 19 Sep 2024 00:20:09 +0100 Subject: [PATCH 15/17] Lock function arguments at compile time (#720) This commit refactors argument the locking locking so that it occurs at compile-time without imposing runtime overheads. The change applies to free-threaded extensions. Behavior differences compared to the prior approach: - it is no longer possible to do ``nb::arg().lock(false)`` or ``.lock(runtime_determined_value)`` - we no longer prohibit locking self in ``__init__``; changing this would also require restoring ``cast_flags::lock``, and it's not clear that the benefit outweighs the complexity. --- docs/api_core.rst | 12 +++-- include/nanobind/nb_attr.h | 102 ++++++++++++++++++++++++++++++------- include/nanobind/nb_call.h | 3 ++ include/nanobind/nb_cast.h | 3 ++ include/nanobind/nb_func.h | 58 ++++++++++++++++++++- src/nb_func.cpp | 90 -------------------------------- 6 files changed, 152 insertions(+), 116 deletions(-) diff --git a/docs/api_core.rst b/docs/api_core.rst index fcb1a920d..e8eaabb08 100644 --- a/docs/api_core.rst +++ b/docs/api_core.rst @@ -1620,7 +1620,9 @@ parameter of :cpp:func:`module_::def`, :cpp:func:`class_::def`, .. cpp:function:: template arg_v operator=(T &&value) const - Assign a default value to the argument. + Return an argument annotation that is like this one but also assigns a + default value to the argument. The default will be converted into a Python + object immediately, so its bindings must have already been defined. .. cpp:function:: arg &none(bool value = true) @@ -1642,11 +1644,11 @@ parameter of :cpp:func:`module_::def`, :cpp:func:`class_::def`, explain it in docstrings and stubs (``str(value)``) does not produce acceptable output. - .. cpp:function:: arg &lock(bool value = true) + .. cpp:function:: arg_locked lock() - Set a flag noting that this argument must be locked when dispatching a - function call in free-threaded Python extensions. It does nothing in - regular GIL-protected extensions. + Return an argument annotation that is like this one but also requests that + this argument be locked when dispatching a function call in free-threaded + Python extensions. It does nothing in regular GIL-protected extensions. .. cpp:struct:: is_method diff --git a/include/nanobind/nb_attr.h b/include/nanobind/nb_attr.h index 36a2dfb59..0476824ee 100644 --- a/include/nanobind/nb_attr.h +++ b/include/nanobind/nb_attr.h @@ -20,9 +20,17 @@ struct name { }; struct arg_v; +struct arg_locked; +struct arg_locked_v; + +// Basic function argument descriptor (no default value, not locked) struct arg { NB_INLINE constexpr explicit arg(const char *name = nullptr) : name_(name), signature_(nullptr) { } + + // operator= can be used to provide a default value template NB_INLINE arg_v operator=(T &&value) const; + + // Mutators that don't change default value or locked state NB_INLINE arg &noconvert(bool value = true) { convert_ = !value; return *this; @@ -31,29 +39,75 @@ struct arg { none_ = value; return *this; } - - NB_INLINE arg &lock(bool value = true) { - lock_ = value; - return *this; - } - NB_INLINE arg &sig(const char *value) { signature_ = value; return *this; } + // After lock(), this argument is locked + NB_INLINE arg_locked lock(); + const char *name_, *signature_; uint8_t convert_{ true }; bool none_{ false }; - bool lock_{ false }; }; +// Function argument descriptor with default value (not locked) struct arg_v : arg { object value; NB_INLINE arg_v(const arg &base, object &&value) : arg(base), value(std::move(value)) {} + + private: + // Inherited mutators would slice off the default, and are not generally needed + using arg::noconvert; + using arg::none; + using arg::sig; + using arg::lock; }; +// Function argument descriptor that is locked (no default value) +struct arg_locked : arg { + NB_INLINE constexpr explicit arg_locked(const char *name = nullptr) : arg(name) { } + NB_INLINE constexpr explicit arg_locked(const arg &base) : arg(base) { } + + // operator= can be used to provide a default value + template NB_INLINE arg_locked_v operator=(T &&value) const; + + // Mutators must be respecified in order to not slice off the locked status + NB_INLINE arg_locked &noconvert(bool value = true) { + convert_ = !value; + return *this; + } + NB_INLINE arg_locked &none(bool value = true) { + none_ = value; + return *this; + } + NB_INLINE arg_locked &sig(const char *value) { + signature_ = value; + return *this; + } + + // Redundant extra lock() is allowed + NB_INLINE arg_locked &lock() { return *this; } +}; + +// Function argument descriptor that is potentially locked and has a default value +struct arg_locked_v : arg_locked { + object value; + NB_INLINE arg_locked_v(const arg_locked &base, object &&value) + : arg_locked(base), value(std::move(value)) {} + + private: + // Inherited mutators would slice off the default, and are not generally needed + using arg_locked::noconvert; + using arg_locked::none; + using arg_locked::sig; + using arg_locked::lock; +}; + +NB_INLINE arg_locked arg::lock() { return arg_locked{*this}; } + template struct call_guard { using type = detail::tuple; }; @@ -133,9 +187,7 @@ enum class func_flags : uint32_t { /// Does this overload specify a custom function signature (for docstrings, typing) has_signature = (1 << 16), /// Does this function have one or more nb::keep_alive() annotations? - has_keep_alive = (1 << 17), - /// Free-threaded Python: does the binding lock the 'self' argument - lock_self = (1 << 18) + has_keep_alive = (1 << 17) }; enum cast_flags : uint8_t { @@ -148,14 +200,11 @@ enum cast_flags : uint8_t { // Indicates that the function dispatcher should accept 'None' arguments accepts_none = (1 << 2), - // Indicates that a function argument must be locked before dispatching a call - lock = (1 << 3), - // Indicates that this cast is performed by nb::cast or nb::try_cast. // This implies that objects added to the cleanup list may be // released immediately after the caster's final output value is // obtained, i.e., before it is used. - manual = (1 << 4) + manual = (1 << 3) }; @@ -300,8 +349,6 @@ NB_INLINE void func_extra_apply(F &f, const arg &a, size_t &index) { flag |= (uint8_t) cast_flags::accepts_none; if (a.convert_) flag |= (uint8_t) cast_flags::convert; - if (a.lock_) - flag |= (uint8_t) cast_flags::lock; arg_data &arg = f.args[index]; arg.flag = flag; @@ -310,6 +357,8 @@ NB_INLINE void func_extra_apply(F &f, const arg &a, size_t &index) { arg.value = nullptr; index++; } +// arg_locked will select the arg overload; the locking is added statically +// in nb_func.h template NB_INLINE void func_extra_apply(F &f, const arg_v &a, size_t &index) { @@ -317,14 +366,18 @@ NB_INLINE void func_extra_apply(F &f, const arg_v &a, size_t &index) { func_extra_apply(f, (const arg &) a, index); ad.value = a.value.ptr(); } +template +NB_INLINE void func_extra_apply(F &f, const arg_locked_v &a, size_t &index) { + arg_data &ad = f.args[index]; + func_extra_apply(f, (const arg_locked &) a, index); + ad.value = a.value.ptr(); +} template NB_INLINE void func_extra_apply(F &, kw_only, size_t &) {} template -NB_INLINE void func_extra_apply(F &f, lock_self, size_t &) { - f.flags |= (uint32_t) func_flags::lock_self; -} +NB_INLINE void func_extra_apply(F &, lock_self, size_t &) {} template NB_INLINE void func_extra_apply(F &, call_guard, size_t &) {} @@ -337,6 +390,7 @@ NB_INLINE void func_extra_apply(F &f, nanobind::keep_alive, size template struct func_extra_info { using call_guard = void; static constexpr bool keep_alive = false; + static constexpr size_t nargs_locked = 0; }; template struct func_extra_info @@ -354,6 +408,16 @@ struct func_extra_info, Ts...> : func_extra static constexpr bool keep_alive = true; }; +template +struct func_extra_info : func_extra_info { + static constexpr size_t nargs_locked = 1 + func_extra_info::nargs_locked; +}; + +template +struct func_extra_info : func_extra_info { + static constexpr size_t nargs_locked = 1 + func_extra_info::nargs_locked; +}; + template NB_INLINE void process_keep_alive(PyObject **, PyObject *, T *) { } diff --git a/include/nanobind/nb_call.h b/include/nanobind/nb_call.h index b20bba9d8..dfbeb45e8 100644 --- a/include/nanobind/nb_call.h +++ b/include/nanobind/nb_call.h @@ -35,6 +35,9 @@ args_proxy api::operator*() const { template NB_INLINE void call_analyze(size_t &nargs, size_t &nkwargs, const T &value) { using D = std::decay_t; + static_assert(!std::is_base_of_v, + "nb::arg().lock() may be used only when defining functions, " + "not when calling them"); if constexpr (std::is_same_v) nkwargs++; diff --git a/include/nanobind/nb_cast.h b/include/nanobind/nb_cast.h index 7001e4ac0..74ad6cefc 100644 --- a/include/nanobind/nb_cast.h +++ b/include/nanobind/nb_cast.h @@ -634,6 +634,9 @@ tuple make_tuple(Args &&...args) { template arg_v arg::operator=(T &&value) const { return arg_v(*this, cast((detail::forward_t) value)); } +template arg_locked_v arg_locked::operator=(T &&value) const { + return arg_locked_v(*this, cast((detail::forward_t) value)); +} template template detail::accessor& detail::accessor::operator=(T &&value) { diff --git a/include/nanobind/nb_func.h b/include/nanobind/nb_func.h index 226840502..1cd98a1b5 100644 --- a/include/nanobind/nb_func.h +++ b/include/nanobind/nb_func.h @@ -32,9 +32,40 @@ bool from_python_keep_alive(Caster &c, PyObject **args, uint8_t *args_flags, template constexpr size_t count_args_before_index(std::index_sequence) { static_assert(sizeof...(Is) == sizeof...(Ts)); - return ((Is < I && (std::is_same_v || std::is_same_v)) + ... + 0); + return ((Is < I && std::is_base_of_v) + ... + 0); } +#if defined(NB_FREE_THREADED) +struct ft_args_collector { + PyObject **args; + handle h1; + handle h2; + size_t index = 0; + + NB_INLINE explicit ft_args_collector(PyObject **a) : args(a) {} + NB_INLINE void apply(arg_locked *) { + if (h1.ptr() == nullptr) + h1 = args[index]; + h2 = args[index]; + ++index; + } + NB_INLINE void apply(arg *) { ++index; } + NB_INLINE void apply(...) {} +}; + +struct ft_args_guard { + NB_INLINE void lock(const ft_args_collector& info) { + PyCriticalSection2_Begin(&cs, info.h1.ptr(), info.h2.ptr()); + } + ~ft_args_guard() { + PyCriticalSection2_End(&cs); + } + PyCriticalSection2 cs; +}; +#endif + +struct no_guard {}; + template NB_INLINE PyObject *func_create(Func &&func, Return (*)(Args...), @@ -66,13 +97,21 @@ NB_INLINE PyObject *func_create(Func &&func, Return (*)(Args...), // Determine the number of nb::arg/nb::arg_v annotations constexpr size_t nargs_provided = - ((std::is_same_v + std::is_same_v) + ... + 0); + (std::is_base_of_v + ... + 0); constexpr bool is_method_det = (std::is_same_v + ... + 0) != 0; constexpr bool is_getter_det = (std::is_same_v + ... + 0) != 0; constexpr bool has_arg_annotations = nargs_provided > 0 && !is_getter_det; + // Determine the number of potentially-locked function arguments + constexpr bool lock_self_det = + (std::is_same_v + ... + 0) != 0; + static_assert(Info::nargs_locked <= 2, + "At most two function arguments can be locked"); + static_assert(!(lock_self_det && !is_method_det), + "The nb::lock_self() annotation only applies to methods"); + // Detect location of nb::kw_only annotation, if supplied. As with args/kwargs // we find the first and last location and later verify they match each other. // Note this is an index in Extra... while args/kwargs_pos_* are indices in @@ -187,6 +226,21 @@ NB_INLINE PyObject *func_create(Func &&func, Return (*)(Args...), tuple...> in; (void) in; +#if defined(NB_FREE_THREADED) + std::conditional_t guard; + if constexpr (Info::nargs_locked) { + ft_args_collector collector{args}; + if constexpr (is_method_det) { + if constexpr (lock_self_det) + collector.apply((arg_locked *) nullptr); + else + collector.apply((arg *) nullptr); + } + (collector.apply((Extra *) nullptr), ...); + guard.lock(collector); + } +#endif + if constexpr (Info::keep_alive) { if ((!from_python_keep_alive(in.template get(), args, args_flags, cleanup, Is) || ...)) diff --git a/src/nb_func.cpp b/src/nb_func.cpp index 56782ab52..c92179374 100644 --- a/src/nb_func.cpp +++ b/src/nb_func.cpp @@ -187,50 +187,6 @@ char *strdup_check(const char *s) { return result; } -#if defined(NB_FREE_THREADED) -// Locked function wrapper for free-threaded Python -struct locked_func { - void *capture[3]; - size_t nargs; - PyObject *(*impl)(void *, PyObject **, uint8_t *, rv_policy, - cleanup_list *); - void (*free_capture)(void *); -}; - -static void locked_func_free(void *p) { - locked_func *f = (locked_func *) ((void **) p)[0]; - if (f->free_capture) - f->free_capture(f->capture); - PyMem_Free(f); -} - -static PyObject *locked_func_impl(void *p, PyObject **args, uint8_t *args_flags, - rv_policy policy, cleanup_list *cleanup) { - handle h1, h2; - size_t ctr = 0; - - locked_func *f = (locked_func *) ((void **) p)[0]; - - for (size_t i = 0; i < f->nargs; ++i) { - if (args_flags[i] & (uint8_t) cast_flags::lock) { - if (ctr == 0) - h1 = args[i]; - h2 = args[i]; - ctr++; - } - } - -#if !defined(NDEBUG) - // nb_func_new ensured that at most two arguments are locked, but - // can't hurt to check once more in debug builds - check(ctr == 1 || ctr == 2, "locked_call: expected 1 or 2 locked arguments!"); -#endif - - ft_object2_guard guard(h1, h2); - return f->impl(f->capture, args, args_flags, policy, cleanup); -} -#endif - /** * \brief Wrap a C++ function into a Python function object * @@ -249,7 +205,6 @@ PyObject *nb_func_new(const void *in_) noexcept { is_implicit = f->flags & (uint32_t) func_flags::is_implicit, is_method = f->flags & (uint32_t) func_flags::is_method, return_ref = f->flags & (uint32_t) func_flags::return_ref, - lock_self = f->flags & (uint32_t) func_flags::lock_self, is_constructor = false, is_init = false, is_new = false, @@ -419,7 +374,6 @@ PyObject *nb_func_new(const void *in_) noexcept { } } - size_t lock_count = 0; if (has_args) { fc->args = (arg_data *) malloc_check(sizeof(arg_data) * f->nargs); @@ -438,55 +392,11 @@ PyObject *nb_func_new(const void *in_) noexcept { } if (a.value == Py_None) a.flag |= (uint8_t) cast_flags::accepts_none; - if (a.flag & (uint8_t) cast_flags::lock) - lock_count++; a.signature = a.signature ? strdup_check(a.signature) : nullptr; Py_XINCREF(a.value); } } - if (lock_self) { - check(is_method && !is_init, - "nb::detail::nb_func_new(\"%s\"): the nb::lock_self annotation only " - "applies to regular methods.", name_cstr); - -#if defined(NB_FREE_THREADED) - // Must potentially allocate dummy 'args' if 'lock_self' is set - if (!has_args) { - fc->args = (arg_data *) malloc_check(sizeof(arg_data) * f->nargs); - memset(fc->args, 0, sizeof(arg_data) * f->nargs); - for (uint32_t i = 1; i < f->nargs; ++i) - fc->args[i].flag &= (uint8_t) cast_flags::convert; - func->vectorcall = nb_func_vectorcall_complex; - fc->flags |= (uint32_t) func_flags::has_args; - has_args = true; - } - - fc->args[0].flag |= (uint8_t) cast_flags::lock; -#endif - - lock_count++; - } - - check(lock_count <= 2, - "nb::detail::nb_func_new(\"%s\"): at most two function arguments can " - "be locked.", name_cstr); - -#if defined(NB_FREE_THREADED) - if (lock_count) { - locked_func *lf = (locked_func *) PyMem_Malloc(sizeof(locked_func)); - check(lf, "nb::detail::nb_func_new(\"%s\"): locked function alloc. failed.", name_cstr); - memcpy(lf->capture, fc->capture, sizeof(func_data::capture)); - lf->nargs = fc->nargs; - lf->impl = fc->impl; - lf->free_capture = (fc->flags & (uint32_t) func_flags::has_free) ? fc->free_capture : nullptr; - fc->impl = locked_func_impl; - fc->free_capture = locked_func_free; - fc->flags |= (uint32_t) func_flags::has_free; - fc->capture[0] = lf; - } -#endif - // Fast path for vector call object construction if (((is_init && is_method) || (is_new && !is_method)) && nb_type_check(f->scope)) { From 53e1e81d8cb5a7e9b5af0249598944e1dd2e7ac4 Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Fri, 20 Sep 2024 22:10:39 +0900 Subject: [PATCH 16/17] relax the conditions when the complex function dispatch loop is used --- src/nb_func.cpp | 27 ++++++++++++++++++--------- tests/test_thread.cpp | 2 +- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/nb_func.cpp b/src/nb_func.cpp index c92179374..5c1820c13 100644 --- a/src/nb_func.cpp +++ b/src/nb_func.cpp @@ -199,6 +199,8 @@ PyObject *nb_func_new(const void *in_) noexcept { bool has_scope = f->flags & (uint32_t) func_flags::has_scope, has_name = f->flags & (uint32_t) func_flags::has_name, has_args = f->flags & (uint32_t) func_flags::has_args, + has_var_args = f->flags & (uint32_t) func_flags::has_var_kwargs, + has_var_kwargs = f->flags & (uint32_t) func_flags::has_var_args, has_keep_alive = f->flags & (uint32_t) func_flags::has_keep_alive, has_doc = f->flags & (uint32_t) func_flags::has_doc, has_signature = f->flags & (uint32_t) func_flags::has_signature, @@ -289,13 +291,20 @@ PyObject *nb_func_new(const void *in_) noexcept { maybe_make_immortal((PyObject *) func); - func->max_nargs = f->nargs; - func->complex_call = f->nargs_pos < f->nargs || has_args || has_keep_alive; + // Check if the complex dispatch loop is needed + bool complex_call = has_keep_alive || has_var_kwargs || has_var_args || f->nargs >= NB_MAXARGS_SIMPLE; + if (has_args) { + for (size_t i = 0; i < f->nargs; ++i) { + arg_data &a = f->args[i]; + complex_call |= a.name != nullptr || a.value != nullptr || + a.flag != cast_flags::convert; + } + } + uint32_t max_nargs = f->nargs; if (func_prev) { - func->complex_call |= ((nb_func *) func_prev)->complex_call; - func->max_nargs = std::max(func->max_nargs, - ((nb_func *) func_prev)->max_nargs); + complex_call |= ((nb_func *) func_prev)->complex_call; + max_nargs = std::max(max_nargs, ((nb_func *) func_prev)->max_nargs); func_data *cur = nb_func_data(func), *prev = nb_func_data(func_prev); @@ -314,10 +323,10 @@ PyObject *nb_func_new(const void *in_) noexcept { Py_CLEAR(func_prev); } - func->complex_call |= func->max_nargs >= NB_MAXARGS_SIMPLE; - - func->vectorcall = func->complex_call ? nb_func_vectorcall_complex - : nb_func_vectorcall_simple; + func->max_nargs = max_nargs; + func->complex_call = complex_call; + func->vectorcall = complex_call ? nb_func_vectorcall_complex + : nb_func_vectorcall_simple; #if !defined(NB_FREE_THREADED) // Register the function diff --git a/tests/test_thread.cpp b/tests/test_thread.cpp index 4ac012943..230e1285a 100644 --- a/tests/test_thread.cpp +++ b/tests/test_thread.cpp @@ -27,7 +27,7 @@ NB_MODULE(test_thread_ext, m) { m.def("inc_safe", [](Counter &c) { c.inc(); }, - "counter"_a.lock()); + nb::arg().lock()); m.def("inc_global", [](Counter &c) { From a1a055ad0294e655d946907b30f51a88deb0f891 Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Fri, 20 Sep 2024 22:51:53 +0900 Subject: [PATCH 17/17] free-threading: final tweaks --- docs/api_core.rst | 8 ++++++-- docs/free_threaded.rst | 27 ++++++++++++++++++--------- docs/functions.rst | 27 ++++++++++++++++++++------- src/nb_internals.cpp | 2 +- 4 files changed, 45 insertions(+), 19 deletions(-) diff --git a/docs/api_core.rst b/docs/api_core.rst index e8eaabb08..2a480dd56 100644 --- a/docs/api_core.rst +++ b/docs/api_core.rst @@ -1668,6 +1668,12 @@ parameter of :cpp:func:`module_::def`, :cpp:func:`class_::def`, Indicate that the bound constructor can be used to perform implicit conversions. +.. cpp:struct:: lock_self + + Indicate that the implicit ``self`` argument of a method should be locked + when dispatching a call in a free-threaded extension. This annotation does + nothing in regular GIL-protected extensions. + .. cpp:struct:: template call_guard Invoke the call guard(s) `Ts` when the bound function executes. The RAII @@ -1887,10 +1893,8 @@ parameter of :cpp:func:`module_::def`, :cpp:func:`class_::def`, .. cpp:struct:: template for_setter - Analogous to :cpp:struct:`for_getter`, but for setters. - .. _class_binding_annotations: Class binding annotations diff --git a/docs/free_threaded.rst b/docs/free_threaded.rst index 3e86f51ac..af2bfa871 100644 --- a/docs/free_threaded.rst +++ b/docs/free_threaded.rst @@ -153,8 +153,8 @@ To do so, call the :cpp:func:`.lock() ` member of :cpp:class:`nb::arg() ` annotations to indicate that an argument must be locked, e.g.: -- ``nb::arg("my_parameter").lock()`` -- ``"my_parameter"_a.lock()`` (short-hand form) +- :cpp:func:`nb::arg("my_parameter").lock() ` +- :cpp:func:`"my_parameter"_a.lock() ` (short-hand form) In methods bindings, pass :cpp:struct:`nb::lock_self() ` to lock the implicit ``self`` argument. Note that at most 2 arguments can be @@ -189,11 +189,19 @@ larger-scale redesign of your project may be in order. .. note:: - Adding locking annotations indiscriminately is inadvisable because they add - a runtime cost to function call dispatcher. - The :cpp:func:`.lock() ` annotation is ignored in GIL-protected - builds. Note that listing arguments in function bindings generally comes at - a small cost in terms of :ref:`binding overheads `. + Adding locking annotations indiscriminately is inadvisable because locked + calls are more costly than unlocked ones. The :cpp:func:`.lock() + ` and :cpp:struct:`nb::lock_self() ` annotations are + ignored in GIL-protected builds, hence this added cost only applies to + free-threaded extensions. + + Furthermore, when adding locking annotations to a function, consider keeping + the arguments *unnamed* (i.e., :cpp:func:`nb::arg().lock() ` + instead of :cpp:func:`nb::arg("name").lock() `) if the function + will never be called with keyword arguments. Processing named arguments + causes small :ref:`binding overheads ` that may be + undesirable if a function that does very little is called at a very high + rate. .. note:: @@ -269,8 +277,9 @@ high rate. Similar to free-threaded Python itself, nanobind avoids this bottleneck by *immortalizing* functions (``nanobind.nb_func``, ``nanobind.nb_method``) and -type bindings. Immortal objects don't require reference counting. In turn, the -downside is that they leak when the interpreter shuts down. Free-threaded +type bindings. Immortal objects don't require reference counting and therefore +cannot cause the bottleneck mentioned above. The main downside of this approach +is that these objects leak when the interpreter shuts down. Free-threaded nanobind extensions disable the internal :ref:`leak checker `, since it would produce many warning messages caused by immortal objects. diff --git a/docs/functions.rst b/docs/functions.rst index 5061a945c..8c364378f 100644 --- a/docs/functions.rst +++ b/docs/functions.rst @@ -567,16 +567,23 @@ In the snippet below, ``f1`` has lower binding overheads compared to ``f2``. This is because ``f1``: -- does not accept keyword arguments and does not specify :ref:`default argument - values `. +1. Does *not* use any of the following advanced argument annotations features: -- has no :cpp:class:`nb::keep_alive\() ` or - :ref:`argument locking ` annotations. + - **Named function arguments**, e.g., :cpp:class:`nb::arg("name") ` or ``"name"_a``. -- takes no variable-length positional (:cpp:class:`nb::args `) or keyword - (:cpp:class:`nb::kwargs `) arguments. + - **Default argument values**, e.g., :cpp:func:`nb::arg() = 0 ` or ``"name"_a = false``. -- has 8 or fewer arguments. + - **Nullability** or **implicit conversion** flags, e.g., + :cpp:func:`nb::arg().none() ` or :cpp:func:`"name"_a.noconvert() + `. + +2. Has no :cpp:class:`nb::keep_alive\() ` + annotations. + +3. Takes no variable-length positional (:cpp:class:`nb::args `) or keyword + (:cpp:class:`nb::kwargs `) arguments. + +4. Has a to total of **8 or fewer** function arguments. If all of the above conditions are satisfied, nanobind switches to a specialized dispatcher that is optimized to handle a small number of positional @@ -587,3 +594,9 @@ execute more slowly, since nanobind must first select a suitable one. These differences are mainly of interest when a function that does *very little* is called at a *very high rate*, in which case binding overheads can become noticeable. + +Regarding point 1 of the above list, note that **locking** is okay, as long as +the annotation does not provide an argument name. In other words, a function +binding with a :cpp:func:`nb::arg().lock() ` for some of its arguments stays on the fast +path. This is mainly of interest for :ref:`free-threaded ` +extensions. diff --git a/src/nb_internals.cpp b/src/nb_internals.cpp index 0720223b8..b4a3f1d9c 100644 --- a/src/nb_internals.cpp +++ b/src/nb_internals.cpp @@ -87,7 +87,7 @@ #define NB_INTERNALS_ID \ "v" NB_TOSTRING(NB_INTERNALS_VERSION) \ - NB_COMPILER_TYPE NB_STDLIB NB_VERSION_DEV_STR NB_BUILD_ABI \ + NB_VERSION_DEV_STR NB_COMPILER_TYPE NB_STDLIB NB_BUILD_ABI \ NB_BUILD_TYPE NB_STABLE_ABI NB_FREE_THREADED_ABI NAMESPACE_BEGIN(NB_NAMESPACE)