From 544adf21284fee445f170a279e7b54a138d46149 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Wed, 24 Jan 2024 20:27:56 +0000 Subject: [PATCH 1/8] gh-110481: Implement inter-thread queue for biased reference counting --- Include/internal/pycore_brc.h | 74 +++++++ Include/internal/pycore_ceval.h | 1 + Include/internal/pycore_interp.h | 1 + Include/internal/pycore_object_stack.h | 6 + Include/internal/pycore_tstate.h | 2 + Lib/test/test_code.py | 1 + Lib/test/test_concurrent_futures/executor.py | 17 +- .../test_process_pool.py | 1 + Makefile.pre.in | 2 + Modules/posixmodule.c | 4 + Objects/dictobject.c | 3 +- Objects/object.c | 8 +- PCbuild/_freeze_module.vcxproj | 1 + PCbuild/_freeze_module.vcxproj.filters | 3 + PCbuild/pythoncore.vcxproj | 2 + PCbuild/pythoncore.vcxproj.filters | 6 + Python/brc.c | 183 ++++++++++++++++++ Python/ceval_gil.c | 8 + Python/gc_free_threading.c | 43 +++- Python/object_stack.c | 21 ++ Python/pystate.c | 11 ++ 21 files changed, 387 insertions(+), 11 deletions(-) create mode 100644 Include/internal/pycore_brc.h create mode 100644 Python/brc.c diff --git a/Include/internal/pycore_brc.h b/Include/internal/pycore_brc.h new file mode 100644 index 00000000000000..c7a7c47eb1f323 --- /dev/null +++ b/Include/internal/pycore_brc.h @@ -0,0 +1,74 @@ +#ifndef Py_INTERNAL_BRC_H +#define Py_INTERNAL_BRC_H + +#include +#include "pycore_llist.h" // struct llist_node +#include "pycore_lock.h" // PyMutex +#include "pycore_object_stack.h" // _PyObjectStack + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#ifdef Py_GIL_DISABLED + +// Prime number to avoid correlations with memory addresses. +#define _Py_BRC_NUM_BUCKETS 257 + +// Hash table bucket +struct _brc_bucket { + // Mutex protects both the bucket and thread queues in this bucket. + PyMutex mutex; + + // Linked list of _PyThreadStateImpl objects hashed to this bucket. + struct llist_node root; +}; + +// Per-interpreter biased reference counting state +struct _brc_state { + // Hash table of thread states by thread-id. Threads within a bucket are + // chained using a doubly-linked list. + struct _brc_bucket table[_Py_BRC_NUM_BUCKETS]; +}; + +// Per-thread biased reference counting state +struct _brc_thread_state { + // Linked-list of thread states per hash bucket + struct llist_node bucket_node; + + // Thread-id as determined by _PyThread_Id() + uintptr_t tid; + + // Objects with refcounts to be merged (protected by bucket mutex) + _PyObjectStack objects_to_merge; + + // Local stack of objects to be merged (not accessed by other threads) + _PyObjectStack local_objects_to_merge; +}; + +// Initialize/finalize the per-thread biased reference counting state +void _Py_brc_init_thread(PyThreadState *tstate); +void _Py_brc_remove_thread(PyThreadState *tstate); + +// Initialize per-interpreter state +void _Py_brc_init_state(PyInterpreterState *interp); + +void _Py_brc_after_fork(PyInterpreterState *interp); + +// Enqueues an object to be merged by it's owning thread (tid). This +// steals a reference to the object. +void _Py_brc_queue_object(PyObject *ob); + +// Merge the refcounts of queued objects for the current thread. +void _Py_brc_merge_refcounts(PyThreadState *tstate); + +#endif /* Py_GIL_DISABLED */ + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_BRC_H */ diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index a66af1389541dd..b158fc9ff5ebc1 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -206,6 +206,7 @@ void _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame) #define _PY_ASYNC_EXCEPTION_BIT 3 #define _PY_GC_SCHEDULED_BIT 4 #define _PY_EVAL_PLEASE_STOP_BIT 5 +#define _PY_EVAL_EXPLICIT_MERGE_BIT 6 /* Reserve a few bits for future use */ #define _PY_EVAL_EVENTS_BITS 8 diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 04e75940dcb573..804a5a31ccb546 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -201,6 +201,7 @@ struct _is { #if defined(Py_GIL_DISABLED) struct _mimalloc_interp_state mimalloc; + struct _brc_state brc; // biased reference counting state #endif // Per-interpreter state for the obmalloc allocator. For the main diff --git a/Include/internal/pycore_object_stack.h b/Include/internal/pycore_object_stack.h index 1dc1c1591525de..8c2f9001efda62 100644 --- a/Include/internal/pycore_object_stack.h +++ b/Include/internal/pycore_object_stack.h @@ -32,6 +32,8 @@ _PyObjectStackChunk_New(void); extern void _PyObjectStackChunk_Free(_PyObjectStackChunk *); +typedef struct _Py_freelist_state _PyFreeListState; + extern void _PyObjectStackChunk_ClearFreeList(_PyFreeListState *state, int is_finalization); @@ -74,6 +76,10 @@ _PyObjectStack_Pop(_PyObjectStack *stack) return obj; } +// Merge src into dst, leaving src empty +extern void +_PyObjectStack_Merge(_PyObjectStack *dst, _PyObjectStack *src); + // Remove all items from the stack extern void _PyObjectStack_Clear(_PyObjectStack *stack); diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 472fa08154e8f9..77a1dc59163d21 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -10,6 +10,7 @@ extern "C" { #include "pycore_freelist.h" // struct _Py_freelist_state #include "pycore_mimalloc.h" // struct _mimalloc_thread_state +#include "pycore_brc.h" // struct _brc_thread_state // Every PyThreadState is actually allocated as a _PyThreadStateImpl. The @@ -22,6 +23,7 @@ typedef struct _PyThreadStateImpl { #ifdef Py_GIL_DISABLED struct _mimalloc_thread_state mimalloc; struct _Py_freelist_state freelist_state; + struct _brc_thread_state brc; #endif } _PyThreadStateImpl; diff --git a/Lib/test/test_code.py b/Lib/test/test_code.py index d8fb826edeb681..46bebfc7af675b 100644 --- a/Lib/test/test_code.py +++ b/Lib/test/test_code.py @@ -865,6 +865,7 @@ def __init__(self, f, test): self.test = test def run(self): del self.f + gc_collect() self.test.assertEqual(LAST_FREED, 500) SetExtra(f.__code__, FREE_INDEX, ctypes.c_voidp(500)) diff --git a/Lib/test/test_concurrent_futures/executor.py b/Lib/test/test_concurrent_futures/executor.py index 1e7d4344740943..6a79fe69ec37cf 100644 --- a/Lib/test/test_concurrent_futures/executor.py +++ b/Lib/test/test_concurrent_futures/executor.py @@ -1,8 +1,10 @@ import threading import time +import unittest import weakref from concurrent import futures from test import support +from test.support import Py_GIL_DISABLED def mul(x, y): @@ -83,10 +85,21 @@ def test_no_stale_references(self): my_object_collected = threading.Event() my_object_callback = weakref.ref( my_object, lambda obj: my_object_collected.set()) - # Deliberately discarding the future. - self.executor.submit(my_object.my_method) + fut = self.executor.submit(my_object.my_method) del my_object + if Py_GIL_DISABLED: + # Due to biased reference counting, my_object might only be + # deallocated while the thread that created it runs -- if the + # thread is paused waiting on an event, it may not merge the + # refcount of the queued object. For that reason, we wait for the + # task to finish (so that it's no longer referenced) and force a + # GC to ensure that it is collected. + fut.result() # Wait for the task to finish. + support.gc_collect() + else: + del fut # Deliberately discard the future. + collected = my_object_collected.wait(timeout=support.SHORT_TIMEOUT) self.assertTrue(collected, "Stale reference not collected within timeout.") diff --git a/Lib/test/test_concurrent_futures/test_process_pool.py b/Lib/test/test_concurrent_futures/test_process_pool.py index 3e61b0c9387c6f..7fc59a05f3deac 100644 --- a/Lib/test/test_concurrent_futures/test_process_pool.py +++ b/Lib/test/test_concurrent_futures/test_process_pool.py @@ -98,6 +98,7 @@ def test_ressources_gced_in_workers(self): # explicitly destroy the object to ensure that EventfulGCObj.__del__() # is called while manager is still running. + support.gc_collect() obj = None support.gc_collect() diff --git a/Makefile.pre.in b/Makefile.pre.in index fff3d3c4914e7a..97bda527c47736 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -405,6 +405,7 @@ PYTHON_OBJS= \ Python/ast_opt.o \ Python/ast_unparse.o \ Python/bltinmodule.o \ + Python/brc.o \ Python/ceval.o \ Python/codecs.o \ Python/compile.o \ @@ -1808,6 +1809,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_ast_state.h \ $(srcdir)/Include/internal/pycore_atexit.h \ $(srcdir)/Include/internal/pycore_bitutils.h \ + $(srcdir)/Include/internal/pycore_brc.h \ $(srcdir)/Include/internal/pycore_bytes_methods.h \ $(srcdir)/Include/internal/pycore_bytesobject.h \ $(srcdir)/Include/internal/pycore_call.h \ diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c index 40ff131b119d66..b2c8f47e59b8c9 100644 --- a/Modules/posixmodule.c +++ b/Modules/posixmodule.c @@ -647,6 +647,10 @@ PyOS_AfterFork_Child(void) goto fatal_error; } +#ifdef Py_GIL_DISABLED + _Py_brc_after_fork(tstate->interp); +#endif + _PySignal_AfterFork(); status = _PyInterpreterState_DeleteExceptMain(runtime); diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 23d7e9b5e38a35..97f3c602d1cf87 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -5581,7 +5581,8 @@ _PyObject_MakeInstanceAttributesFromDict(PyObject *obj, PyDictOrValues *dorv) // Don't try this at home, kids: dict->ma_keys = NULL; dict->ma_values = NULL; - Py_DECREF(dict); + Py_SET_REFCNT(dict, 0); + _Py_Dealloc((PyObject *)dict); return true; } diff --git a/Objects/object.c b/Objects/object.c index 587c5528c01345..194fee5137ecbc 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -2,6 +2,7 @@ /* Generic object operations; and implementation of None */ #include "Python.h" +#include "pycore_brc.h" // _Py_brc_queue_object() #include "pycore_call.h" // _PyObject_CallNoArgs() #include "pycore_ceval.h" // _Py_EnterRecursiveCallTstate() #include "pycore_context.h" // _PyContextTokenMissing_Type @@ -344,12 +345,7 @@ _Py_DecRefSharedDebug(PyObject *o, const char *filename, int lineno) &shared, new_shared)); if (should_queue) { - // TODO: the inter-thread queue is not yet implemented. For now, - // we just merge the refcount here. - Py_ssize_t refcount = _Py_ExplicitMergeRefcount(o, -1); - if (refcount == 0) { - _Py_Dealloc(o); - } + _Py_brc_queue_object(o); } else if (new_shared == _Py_REF_MERGED) { // refcount is zero AND merged diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj index 35788ec4503e8f..49f529ebbc2f9b 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -191,6 +191,7 @@ + diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters index 7a44179e356105..5b1bd7552b4cd9 100644 --- a/PCbuild/_freeze_module.vcxproj.filters +++ b/PCbuild/_freeze_module.vcxproj.filters @@ -46,6 +46,9 @@ Source Files + + Python + Source Files diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index e1ff97659659ee..4cc0ca4b9af8de 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -206,6 +206,7 @@ + @@ -553,6 +554,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 4c55f23006b2f0..ceaa21217267cf 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -546,6 +546,9 @@ Include\internal + + Include\internal + Include\internal @@ -1253,6 +1256,9 @@ Python + + Python + Python diff --git a/Python/brc.c b/Python/brc.c new file mode 100644 index 00000000000000..41cf6de686a54f --- /dev/null +++ b/Python/brc.c @@ -0,0 +1,183 @@ +// Implementation of biased reference counting inter-thread queue. +// +// Biased reference counting maintains two refcount fields in each object: +// ob_ref_local and ob_ref_shared. The true refcount is the sum of these two +// fields. In some cases, when refcounting operations are split across threads, +// the ob_ref_shared field can be negative (although the total refcount must +// be at least zero). In this case, the thread that decremented the refcount +// requests that the owning thread give up ownership and merge the refcount +// fields. This file implements the mechanism for doing so. +// +// Each thread state maintains a queue of objects whose refcounts it should +// merge. The thread states are stored in a per-interpreter hash table by +// thread id. The hash table has a fixed size and uses a linked list to store +// thread states within each bucket. +// +// The queueing thread uses the eval breaker mechanism to notify the owning +// thread that it has objects to merge. Additionaly, all queued objects are +// merged during GC. +#include "Python.h" +#include "pycore_object.h" // _Py_ExplicitMergeRefcount +#include "pycore_brc.h" // struct _brc_thread_state +#include "pycore_ceval.h" // _Py_set_eval_breaker_bit +#include "pycore_llist.h" // struct llist_node +#include "pycore_pystate.h" // _PyThreadStateImpl + +#ifdef Py_GIL_DISABLED + +// Get the hashtable bucket for a given thread id. +static struct _brc_bucket * +get_bucket(PyInterpreterState *interp, uintptr_t tid) +{ + return &interp->brc.table[tid % _Py_BRC_NUM_BUCKETS]; +} + +// Find the thread state in a hash table bucket by thread id. +static _PyThreadStateImpl * +find_thread_state(struct _brc_bucket *bucket, uintptr_t thread_id) +{ + struct llist_node *node; + llist_for_each(node, &bucket->root) { + // Get the containing _PyThreadStateImpl from the linked-list node. + _PyThreadStateImpl *ts = llist_data(node, _PyThreadStateImpl, + brc.bucket_node); + if (ts->brc.tid == thread_id) { + return ts; + } + } + return NULL; +} + +// Enqueue an object to be merged by the owning thread. This steals a +// reference to the object. +void +_Py_brc_queue_object(PyObject *ob) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + + uintptr_t ob_tid = _Py_atomic_load_uintptr(&ob->ob_tid); + if (ob_tid == 0) { + // The owning thread may have concurrently decided to merge the + // refcount fields. + Py_DECREF(ob); + return; + } + + struct _brc_bucket *bucket = get_bucket(interp, ob_tid); + PyMutex_Lock(&bucket->mutex); + _PyThreadStateImpl *tstate = find_thread_state(bucket, ob_tid); + if (tstate == NULL) { + // If we didn't find the owning thread then it must have already exited. + // It's safe (and necessary) to merge the refcount. Subtract one when + // merging because we've stolen a reference. + Py_ssize_t refcount = _Py_ExplicitMergeRefcount(ob, -1); + PyMutex_Unlock(&bucket->mutex); + if (refcount == 0) { + _Py_Dealloc(ob); + } + return; + } + + if (_PyObjectStack_Push(&tstate->brc.objects_to_merge, ob) < 0) { + PyMutex_Unlock(&bucket->mutex); + + // Fall back to stopping all threads and manually merging the refcount + // if we can't enqueue the object to be merged. + _PyEval_StopTheWorld(interp); + Py_ssize_t refcount = _Py_ExplicitMergeRefcount(ob, -1); + _PyEval_StartTheWorld(interp); + + if (refcount == 0) { + _Py_Dealloc(ob); + } + return; + } + + // Notify owning thread + _Py_set_eval_breaker_bit(interp, _PY_EVAL_EXPLICIT_MERGE_BIT, 1); + + PyMutex_Unlock(&bucket->mutex); +} + +static void +merge_queued_objects(_PyObjectStack *to_merge) +{ + PyObject *ob; + while ((ob = _PyObjectStack_Pop(to_merge)) != NULL) { + // Subtract one when merging because the queue had a reference. + Py_ssize_t refcount = _Py_ExplicitMergeRefcount(ob, -1); + if (refcount == 0) { + _Py_Dealloc(ob); + } + } +} + +// Process this thread's queue of objects to merge. +void +_Py_brc_merge_refcounts(PyThreadState *tstate) +{ + struct _brc_thread_state *brc = &((_PyThreadStateImpl *)tstate)->brc; + struct _brc_bucket *bucket = get_bucket(tstate->interp, brc->tid); + + // Append all objects into a local stack. We don't want to hold the lock + // while calling destructors. + PyMutex_Lock(&bucket->mutex); + _PyObjectStack_Merge(&brc->local_objects_to_merge, &brc->objects_to_merge); + PyMutex_Unlock(&bucket->mutex); + + // Process the local stack until it's empty + merge_queued_objects(&brc->local_objects_to_merge); +} + +void +_Py_brc_init_state(PyInterpreterState *interp) +{ + struct _brc_state *brc = &interp->brc; + for (Py_ssize_t i = 0; i < _Py_BRC_NUM_BUCKETS; i++) { + llist_init(&brc->table[i].root); + } +} + +void +_Py_brc_init_thread(PyThreadState *tstate) +{ + struct _brc_thread_state *brc = &((_PyThreadStateImpl *)tstate)->brc; + brc->tid = _Py_ThreadId(); + + // Add ourself to the hashtable + struct _brc_bucket *bucket = get_bucket(tstate->interp, brc->tid); + PyMutex_Lock(&bucket->mutex); + llist_insert_tail(&bucket->root, &brc->bucket_node); + PyMutex_Unlock(&bucket->mutex); +} + +void +_Py_brc_remove_thread(PyThreadState *tstate) +{ + struct _brc_thread_state *brc = &((_PyThreadStateImpl *)tstate)->brc; + struct _brc_bucket *bucket = get_bucket(tstate->interp, brc->tid); + + // Remove ourself from the hashtable + PyMutex_Lock(&bucket->mutex); + llist_remove(&brc->bucket_node); + _PyObjectStack_Merge(&brc->local_objects_to_merge, &brc->objects_to_merge); + PyMutex_Unlock(&bucket->mutex); + + // Process "local_queue" until it's empty + merge_queued_objects(&brc->local_objects_to_merge); +} + +void +_Py_brc_after_fork(PyInterpreterState *interp) +{ + // Unlock all bucket mutexes. Some of the buckets may be locked because + // locks can be handed off to a parked thread (see lock.c). We don't have + // to worry about consistency here, becuase no thread can be actively + // modifying a bucket, but it might be paused (not yet woken up) on a + // PyMutex_Lock while holding that lock. + for (Py_ssize_t i = 0; i < _Py_BRC_NUM_BUCKETS; i++) { + _PyMutex_at_fork_reinit(&interp->brc.table[i].mutex); + } +} + +#endif /* Py_GIL_DISABLED */ diff --git a/Python/ceval_gil.c b/Python/ceval_gil.c index f3b169241535f3..9028c53ec49f02 100644 --- a/Python/ceval_gil.c +++ b/Python/ceval_gil.c @@ -972,6 +972,14 @@ _Py_HandlePending(PyThreadState *tstate) } } +#ifdef Py_GIL_DISABLED + /* Objects with refcounts to merge */ + if (_Py_eval_breaker_bit_is_set(interp, _PY_EVAL_EXPLICIT_MERGE_BIT)) { + _Py_set_eval_breaker_bit(interp, _PY_EVAL_EXPLICIT_MERGE_BIT, 0); + _Py_brc_merge_refcounts(tstate); + } +#endif + /* GC scheduled to run */ if (_Py_eval_breaker_bit_is_set(interp, _PY_GC_SCHEDULED_BIT)) { _Py_set_eval_breaker_bit(interp, _PY_GC_SCHEDULED_BIT, 0); diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index a6513a2c4aba2a..7cef47e1834008 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -1,5 +1,6 @@ // Cyclic garbage collector implementation for free-threaded build. #include "Python.h" +#include "pycore_brc.h" // struct _brc_thread_state #include "pycore_ceval.h" // _Py_set_eval_breaker_bit() #include "pycore_context.h" #include "pycore_dict.h" // _PyDict_MaybeUntrack() @@ -149,8 +150,7 @@ gc_decref(PyObject *op) op->ob_tid -= 1; } -// Merge refcounts while the world is stopped. -static void +static Py_ssize_t merge_refcount(PyObject *op, Py_ssize_t extra) { assert(_PyInterpreterState_GET()->stoptheworld.world_stopped); @@ -166,6 +166,7 @@ merge_refcount(PyObject *op, Py_ssize_t extra) op->ob_tid = 0; op->ob_ref_local = 0; op->ob_ref_shared = _Py_REF_SHARED(refcount, _Py_REF_MERGED); + return refcount; } static void @@ -279,6 +280,38 @@ gc_visit_heaps(PyInterpreterState *interp, mi_block_visit_fun *visitor, return err; } +static void +merge_queued_objects(_PyThreadStateImpl *tstate, struct collection_state *state) +{ + struct _brc_thread_state *brc = &tstate->brc; + _PyObjectStack_Merge(&brc->local_objects_to_merge, &brc->objects_to_merge); + + PyObject *op; + while ((op = _PyObjectStack_Pop(&brc->local_objects_to_merge)) != NULL) { + // Subtract one when merging because the queue had a reference. + Py_ssize_t refcount = merge_refcount(op, -1); + + if (!_PyObject_GC_IS_TRACKED(op) && refcount == 0) { + // GC objects with zero refcount are handled subsequently by the + // GC as if they were cyclic trash, but we have to handle dead + // non-GC objects here. Add one to the refcount so that we can + // decref and deallocate the object once we start the world again. + op->ob_ref_shared += (1 << _Py_REF_SHARED_SHIFT); + worklist_push(&state->objs_to_decref, op); + } + } +} + +static void +merge_all_queued_objects(PyInterpreterState *interp, struct collection_state *state) +{ + HEAD_LOCK(&_PyRuntime); + for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) { + merge_queued_objects((_PyThreadStateImpl *)p, state); + } + HEAD_UNLOCK(&_PyRuntime); +} + // Subtract an incoming reference from the computed "gc_refs" refcount. static int visit_decref(PyObject *op, void *arg) @@ -918,6 +951,9 @@ static void gc_collect_internal(PyInterpreterState *interp, struct collection_state *state) { _PyEval_StopTheWorld(interp); + // merge refcounts for all queued objects + merge_all_queued_objects(interp, state); + // Find unreachable objects int err = deduce_unreachable_heap(interp, state); if (err < 0) { @@ -937,6 +973,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state) clear_weakrefs(state); _PyEval_StartTheWorld(interp); + // Deallocate any object from the refcount merge step + cleanup_worklist(&state->objs_to_decref); + // Call weakref callbacks and finalizers after unpausing other threads to // avoid potential deadlocks. call_weakref_callbacks(state); diff --git a/Python/object_stack.c b/Python/object_stack.c index 8544892eb71dcb..ced4460da00f44 100644 --- a/Python/object_stack.c +++ b/Python/object_stack.c @@ -67,6 +67,27 @@ _PyObjectStack_Clear(_PyObjectStack *queue) } } +void +_PyObjectStack_Merge(_PyObjectStack *dst, _PyObjectStack *src) +{ + if (src->head == NULL) { + return; + } + + if (dst->head != NULL) { + // First, append dst to the bottom of src + _PyObjectStackChunk *last = src->head; + while (last->prev != NULL) { + last = last->prev; + } + last->prev = dst->head; + } + + // Now that src has all the chunks, set dst to src + dst->head = src->head; + src->head = NULL; +} + void _PyObjectStackChunk_ClearFreeList(_PyFreeListState *free_lists, int is_finalization) { diff --git a/Python/pystate.c b/Python/pystate.c index 430121a6a35d7f..951f58eb210d68 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -609,6 +609,9 @@ init_interpreter(PyInterpreterState *interp, _PyGC_InitState(&interp->gc); PyConfig_InitPythonConfig(&interp->config); _PyType_InitCache(interp); +#ifdef Py_GIL_DISABLED + _Py_brc_init_state(interp); +#endif for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { interp->monitors.tools[i] = 0; } @@ -1334,6 +1337,11 @@ init_threadstate(_PyThreadStateImpl *_tstate, tstate->datastack_limit = NULL; tstate->what_event = -1; +#ifdef Py_GIL_DISABLED + // Initialize biased reference counting inter-thread queue + _Py_brc_init_thread(tstate); +#endif + if (interp->stoptheworld.requested || _PyRuntime.stoptheworld.requested) { // Start in the suspended state if there is an ongoing stop-the-world. tstate->state = _Py_THREAD_SUSPENDED; @@ -1556,6 +1564,9 @@ PyThreadState_Clear(PyThreadState *tstate) _PyFreeListState *freelist_state = &((_PyThreadStateImpl*)tstate)->freelist_state; _Py_ClearFreeLists(freelist_state, 1); _PySlice_ClearCache(freelist_state); + + // Remove ourself from the biased reference counting table of threads. + _Py_brc_remove_thread(tstate); #endif _PyThreadState_ClearMimallocHeaps(tstate); From ee018914fd5790a501bd5443fb5c1727b2be384a Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 2 Feb 2024 16:32:29 +0000 Subject: [PATCH 2/8] Update comments from review --- Include/internal/pycore_brc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_brc.h b/Include/internal/pycore_brc.h index c7a7c47eb1f323..3453d83b57ca97 100644 --- a/Include/internal/pycore_brc.h +++ b/Include/internal/pycore_brc.h @@ -21,7 +21,7 @@ extern "C" { // Hash table bucket struct _brc_bucket { - // Mutex protects both the bucket and thread queues in this bucket. + // Mutex protects both the bucket and thread state queues in this bucket. PyMutex mutex; // Linked list of _PyThreadStateImpl objects hashed to this bucket. @@ -30,8 +30,8 @@ struct _brc_bucket { // Per-interpreter biased reference counting state struct _brc_state { - // Hash table of thread states by thread-id. Threads within a bucket are - // chained using a doubly-linked list. + // Hash table of thread states by thread-id. Thread states within a bucket + // are chained using a doubly-linked list. struct _brc_bucket table[_Py_BRC_NUM_BUCKETS]; }; From cbfaf6b3c0cbe48f297445dbc79288f8c5aab915 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 2 Feb 2024 16:32:35 +0000 Subject: [PATCH 3/8] Fix _Py_brc_remove_thread --- Python/brc.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/Python/brc.c b/Python/brc.c index 41cf6de686a54f..f1fd57a2964cf5 100644 --- a/Python/brc.c +++ b/Python/brc.c @@ -157,14 +157,29 @@ _Py_brc_remove_thread(PyThreadState *tstate) struct _brc_thread_state *brc = &((_PyThreadStateImpl *)tstate)->brc; struct _brc_bucket *bucket = get_bucket(tstate->interp, brc->tid); - // Remove ourself from the hashtable - PyMutex_Lock(&bucket->mutex); - llist_remove(&brc->bucket_node); - _PyObjectStack_Merge(&brc->local_objects_to_merge, &brc->objects_to_merge); - PyMutex_Unlock(&bucket->mutex); + // We need to fully process any objects to merge before removing ourself + // from the hashtable. It is not safe to perform any refcount operations + // after we are removed. After that point, other threads treat our objects + // as abandoned and may merge the objects' refcounts directly. + bool empty = false; + while (!empty) { + // Process the local stack until it's empty + merge_queued_objects(&brc->local_objects_to_merge); + + PyMutex_Lock(&bucket->mutex); + empty = (brc->objects_to_merge.head == NULL); + if (empty) { + llist_remove(&brc->bucket_node); + } + else { + _PyObjectStack_Merge(&brc->local_objects_to_merge, + &brc->objects_to_merge); + } + PyMutex_Unlock(&bucket->mutex); + } - // Process "local_queue" until it's empty - merge_queued_objects(&brc->local_objects_to_merge); + assert(brc->local_objects_to_merge.head == NULL); + assert(brc->objects_to_merge.head == NULL); } void From 127c595084d3b2132d528dd039856f07fb899982 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 2 Feb 2024 22:47:00 +0000 Subject: [PATCH 4/8] Include pycore_freelist.h in pycore_object_stack.h --- Include/internal/pycore_object_stack.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Include/internal/pycore_object_stack.h b/Include/internal/pycore_object_stack.h index 8c2f9001efda62..d042be2a98090a 100644 --- a/Include/internal/pycore_object_stack.h +++ b/Include/internal/pycore_object_stack.h @@ -1,6 +1,8 @@ #ifndef Py_INTERNAL_OBJECT_STACK_H #define Py_INTERNAL_OBJECT_STACK_H +#include "pycore_freelist.h" // _PyFreeListState + #ifdef __cplusplus extern "C" { #endif @@ -32,8 +34,6 @@ _PyObjectStackChunk_New(void); extern void _PyObjectStackChunk_Free(_PyObjectStackChunk *); -typedef struct _Py_freelist_state _PyFreeListState; - extern void _PyObjectStackChunk_ClearFreeList(_PyFreeListState *state, int is_finalization); From f00252a816e50862f45f390c41513a8c58809b98 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Mon, 5 Feb 2024 21:32:31 +0000 Subject: [PATCH 5/8] Move _Py_brc_after_fork up --- Modules/posixmodule.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c index b2c8f47e59b8c9..1ae43a4d483118 100644 --- a/Modules/posixmodule.c +++ b/Modules/posixmodule.c @@ -637,6 +637,10 @@ PyOS_AfterFork_Child(void) tstate->native_thread_id = PyThread_get_thread_native_id(); #endif +#ifdef Py_GIL_DISABLED + _Py_brc_after_fork(tstate->interp); +#endif + status = _PyEval_ReInitThreads(tstate); if (_PyStatus_EXCEPTION(status)) { goto fatal_error; @@ -647,10 +651,6 @@ PyOS_AfterFork_Child(void) goto fatal_error; } -#ifdef Py_GIL_DISABLED - _Py_brc_after_fork(tstate->interp); -#endif - _PySignal_AfterFork(); status = _PyInterpreterState_DeleteExceptMain(runtime); From 51c662540bfcbaa9fadeaef5fc840b3c8504e254 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 8 Feb 2024 18:49:26 +0000 Subject: [PATCH 6/8] Fix refleak tests. Revert back to Py_DECREF() instead of the Py_SET_REFCNT(dict, 0) and fix check that this thread owns the only reference to the dict. That ensures that the final Py_DECREF call immediately frees the dict instead of possibly enqueuing it to be merged. --- Objects/dictobject.c | 19 ++++++++++++++++--- Python/gc_free_threading.c | 3 +++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Objects/dictobject.c b/Objects/dictobject.c index dbc7237a94c0e7..9b1defa5cbc609 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -5989,6 +5989,18 @@ _PyObject_MakeDictFromInstanceAttributes(PyObject *obj, PyDictValues *values) return make_dict_from_instance_attributes(interp, keys, values); } +static bool +has_unique_reference(PyObject *op) +{ +#ifdef Py_GIL_DISABLED + return (_Py_IsOwnedByCurrentThread(op) && + op->ob_ref_local == 1 && + _Py_atomic_load_ssize_relaxed(&op->ob_ref_shared) == 0); +#else + return Py_REFCNT(op) == 1; +#endif +} + // Return true if the dict was dematerialized, false otherwise. bool _PyObject_MakeInstanceAttributesFromDict(PyObject *obj, PyDictOrValues *dorv) @@ -6005,7 +6017,9 @@ _PyObject_MakeInstanceAttributesFromDict(PyObject *obj, PyDictOrValues *dorv) return false; } assert(_PyType_HasFeature(Py_TYPE(obj), Py_TPFLAGS_HEAPTYPE)); - if (dict->ma_keys != CACHED_KEYS(Py_TYPE(obj)) || Py_REFCNT(dict) != 1) { + if (dict->ma_keys != CACHED_KEYS(Py_TYPE(obj)) || + !has_unique_reference((PyObject *)dict)) + { return false; } assert(dict->ma_values); @@ -6016,8 +6030,7 @@ _PyObject_MakeInstanceAttributesFromDict(PyObject *obj, PyDictOrValues *dorv) // Don't try this at home, kids: dict->ma_keys = NULL; dict->ma_values = NULL; - Py_SET_REFCNT(dict, 0); - _Py_Dealloc((PyObject *)dict); + Py_DECREF(dict); return true; } diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 0f77d33ad7cfa6..5d3b097dee93e8 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -300,6 +300,9 @@ merge_queued_objects(_PyThreadStateImpl *tstate, struct collection_state *state) // non-GC objects here. Add one to the refcount so that we can // decref and deallocate the object once we start the world again. op->ob_ref_shared += (1 << _Py_REF_SHARED_SHIFT); +#ifdef Py_REF_DEBUG + _Py_IncRefTotal(_PyInterpreterState_GET()); +#endif worklist_push(&state->objs_to_decref, op); } } From c63185678fa303832c58135cb3f5c26af02d91fb Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 8 Feb 2024 20:26:32 +0000 Subject: [PATCH 7/8] Fix test_weakref.test_threaded_weak_key_dict_copy refleak --- Objects/object.c | 11 +++++++---- Objects/obmalloc.c | 7 ++++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Objects/object.c b/Objects/object.c index 9e1ab98a848189..dd7d6334977f7d 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -345,6 +345,9 @@ _Py_DecRefSharedDebug(PyObject *o, const char *filename, int lineno) &shared, new_shared)); if (should_queue) { +#ifdef Py_REF_DEBUG + _Py_IncRefTotal(_PyInterpreterState_GET()); +#endif _Py_brc_queue_object(o); } else if (new_shared == _Py_REF_MERGED) { @@ -395,13 +398,13 @@ _Py_ExplicitMergeRefcount(PyObject *op, Py_ssize_t extra) Py_ssize_t shared = _Py_atomic_load_ssize_relaxed(&op->ob_ref_shared); do { refcnt = Py_ARITHMETIC_RIGHT_SHIFT(Py_ssize_t, shared, _Py_REF_SHARED_SHIFT); - if (_Py_REF_IS_MERGED(shared)) { - return refcnt; - } - refcnt += (Py_ssize_t)op->ob_ref_local; refcnt += extra; +#ifdef Py_REF_DEBUG + _Py_AddRefTotal(_PyInterpreterState_GET(), extra); +#endif + new_shared = _Py_REF_SHARED(refcnt, _Py_REF_MERGED); } while (!_Py_atomic_compare_exchange_ssize(&op->ob_ref_shared, &shared, new_shared)); diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c index bea4ea85332bdd..40f55c082e6ebf 100644 --- a/Objects/obmalloc.c +++ b/Objects/obmalloc.c @@ -1073,7 +1073,12 @@ get_mimalloc_allocated_blocks(PyInterpreterState *interp) mi_heap_visit_blocks(heap, false, &count_blocks, &allocated_blocks); } } - // TODO(sgross): count blocks in abandoned segments. + + mi_abandoned_pool_t *pool = &interp->mimalloc.abandoned_pool; + for (uint8_t tag = 0; tag < _Py_MIMALLOC_HEAP_COUNT; tag++) { + _mi_abandoned_pool_visit_blocks(pool, tag, false, &count_blocks, + &allocated_blocks); + } #else // TODO(sgross): this only counts the current thread's blocks. mi_heap_t *heap = mi_heap_get_default(); From 97cf9645209457a80e9f08f9d8c0bdf541302e42 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 9 Feb 2024 14:45:23 +0000 Subject: [PATCH 8/8] Remove duplciate _Py_AddRefTotal from merge commit --- Objects/object.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Objects/object.c b/Objects/object.c index 626b0145869250..61e6131c6e99bb 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -401,10 +401,6 @@ _Py_ExplicitMergeRefcount(PyObject *op, Py_ssize_t extra) refcnt += (Py_ssize_t)op->ob_ref_local; refcnt += extra; -#ifdef Py_REF_DEBUG - _Py_AddRefTotal(_PyInterpreterState_GET(), extra); -#endif - new_shared = _Py_REF_SHARED(refcnt, _Py_REF_MERGED); } while (!_Py_atomic_compare_exchange_ssize(&op->ob_ref_shared, &shared, new_shared));