7 changes: 7 additions & 0 deletions Include/cpython/pystate.h
@@ -149,6 +149,13 @@ struct _ts {

struct _py_trashcan trash;

/* Tagged pointer to top-most critical section, or zero if there is no
* active critical section. Critical sections are only used in
* `--disable-gil` builds (i.e., when Py_NOGIL is defined to 1). In the
* default build, this field is always zero.
*/
uintptr_t critical_section;

/* Called when a thread state is deleted normally, but not when it
* is destroyed after fork().
* Pain: to prevent rare but fatal shutdown errors (issue 18808),
237 changes: 237 additions & 0 deletions Include/internal/pycore_critical_section.h
@@ -0,0 +1,237 @@
#ifndef Py_INTERNAL_CRITICAL_SECTION_H
#define Py_INTERNAL_CRITICAL_SECTION_H

#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif

#include "pycore_lock.h" // PyMutex
#include "pycore_pystate.h" // _PyThreadState_GET()
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Implementation of Python critical sections: helpers to replace the global
// interpreter lock with per-object locks, while avoiding deadlocks.
//
// NOTE: These APIs are no-ops in non-free-threaded builds.
//
// Straightforward per-object locking could introduce deadlocks that were not
// present when running with the GIL. Threads may hold locks for multiple
// objects simultaneously because Python operations can nest. If threads were
// to acquire the same locks in different orders, they would deadlock.
//
// One way to avoid deadlocks is to allow threads to hold only the lock (or
// locks) for a single operation at a time (typically a single lock, but some
// operations involve two locks). When a thread begins a nested operation it
// could suspend the locks for any outer operation: before beginning the nested
// operation, the locks for the outer operation are released and when the
// nested operation completes, the locks for the outer operation are
// reacquired.
//
// To improve performance, this API uses a variation of the above scheme.
// Instead of immediately suspending locks any time a nested operation begins,
// locks are only suspended if the thread would block. This reduces the number
// of lock acquisitions and releases for nested operations, while avoiding
// deadlocks.
//
// Additionally, the locks for any active operation are suspended around
// other potentially blocking operations, such as I/O. This is because the
// interaction between locks and blocking operations can lead to deadlocks in
// the same way as the interaction between multiple locks.
//
// Each thread's critical sections and their corresponding locks are tracked in
// a stack in `PyThreadState.critical_section`. When a thread calls
// `_PyThreadState_Detach()`, such as before a blocking I/O operation or when
// waiting to acquire a lock, the thread suspends all of its active critical
// sections, temporarily releasing the associated locks. When the thread calls
// `_PyThreadState_Attach()`, it resumes the top-most (i.e., most recent)
// critical section by reacquiring the associated lock or locks. See
// `_PyCriticalSection_Resume()`.
//
// NOTE: Only the top-most critical section is guaranteed to be active.
// Operations that need to lock two objects at once must use
// `Py_BEGIN_CRITICAL_SECTION2()`. You *CANNOT* use nested critical sections
// to lock more than one object at once, because the inner critical section
// suspend the outer critical sections. This API does not provide a way to
// lock more than two objects at once.
//
// NOTE: Critical sections implicitly behave like reentrant locks because
// attempting to acquire the same lock will suspend any outer (earlier)
// critical sections. However, they are less efficient for this use case than
// purposefully designed reentrant locks.
//
// Example usage:
// Py_BEGIN_CRITICAL_SECTION(op);
// ...
// Py_END_CRITICAL_SECTION();
//
// To lock two objects at once:
// Py_BEGIN_CRITICAL_SECTION2(op1, op2);
// ...
// Py_END_CRITICAL_SECTION2();
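//
// As a hedged illustration (not part of this patch), a function that mutates
// two objects under their per-object locks might look like the following
// sketch; `pair_swap` and its body are hypothetical:
//
//   static void
//   pair_swap(PyObject *a, PyObject *b)
//   {
//       Py_BEGIN_CRITICAL_SECTION2(a, b);
//       /* ... both objects are locked here in --disable-gil builds ... */
//       Py_END_CRITICAL_SECTION2();
//   }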


// Tagged pointers to critical sections use the two least significant bits to
// indicate whether the pointed-to critical section is inactive and whether it
// is a _PyCriticalSection2 object.
#define _Py_CRITICAL_SECTION_INACTIVE 0x1
#define _Py_CRITICAL_SECTION_TWO_MUTEXES 0x2
#define _Py_CRITICAL_SECTION_MASK 0x3
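
// A minimal sketch (an assumed helper, not part of this patch) of how the tag
// bits would be decoded; `untag` is a hypothetical name:
//
//   static inline _PyCriticalSection *
//   untag(uintptr_t tag)
//   {
//       // Clear both tag bits to recover the critical section pointer.
//       return (_PyCriticalSection *)(tag & ~(uintptr_t)_Py_CRITICAL_SECTION_MASK);
//   }
//
// A set _Py_CRITICAL_SECTION_INACTIVE bit means the section is suspended; a
// set _Py_CRITICAL_SECTION_TWO_MUTEXES bit means the pointer refers to a
// _PyCriticalSection2.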

#ifdef Py_NOGIL
# define Py_BEGIN_CRITICAL_SECTION(op) \
{ \
_PyCriticalSection _cs; \
_PyCriticalSection_Begin(&_cs, &_PyObject_CAST(op)->ob_mutex)

# define Py_END_CRITICAL_SECTION() \
_PyCriticalSection_End(&_cs); \
}

# define Py_BEGIN_CRITICAL_SECTION2(a, b) \
{ \
_PyCriticalSection2 _cs2; \
_PyCriticalSection2_Begin(&_cs2, &_PyObject_CAST(a)->ob_mutex, &_PyObject_CAST(b)->ob_mutex)

# define Py_END_CRITICAL_SECTION2() \
_PyCriticalSection2_End(&_cs2); \
}
#else /* !Py_NOGIL */
// The critical section APIs are no-ops with the GIL.
# define Py_BEGIN_CRITICAL_SECTION(op)
# define Py_END_CRITICAL_SECTION()
# define Py_BEGIN_CRITICAL_SECTION2(a, b)
# define Py_END_CRITICAL_SECTION2()
#endif /* !Py_NOGIL */

typedef struct {
// Tagged pointer to an outer active critical section (or 0).
// The two least-significant-bits indicate whether the pointed-to critical
// section is inactive and whether it is a _PyCriticalSection2 object.
uintptr_t prev;

// Mutex used to protect the critical section
PyMutex *mutex;
} _PyCriticalSection;

// A critical section protected by two mutexes. Use
// _PyCriticalSection2_Begin and _PyCriticalSection2_End.
typedef struct {
_PyCriticalSection base;

PyMutex *mutex2;
} _PyCriticalSection2;

static inline int
_PyCriticalSection_IsActive(uintptr_t tag)
{
return tag != 0 && (tag & _Py_CRITICAL_SECTION_INACTIVE) == 0;
}

// Resumes the top-most critical section.
PyAPI_FUNC(void)
_PyCriticalSection_Resume(PyThreadState *tstate);

// (private) slow path for locking the mutex
PyAPI_FUNC(void)
_PyCriticalSection_BeginSlow(_PyCriticalSection *c, PyMutex *m);

PyAPI_FUNC(void)
_PyCriticalSection2_BeginSlow(_PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2,
int is_m1_locked);

static inline void
_PyCriticalSection_Begin(_PyCriticalSection *c, PyMutex *m)
{
if (PyMutex_LockFast(&m->v)) {
PyThreadState *tstate = _PyThreadState_GET();
c->mutex = m;
c->prev = tstate->critical_section;
tstate->critical_section = (uintptr_t)c;
}
else {
_PyCriticalSection_BeginSlow(c, m);
}
}

// Removes the top-most critical section from the thread's stack of critical
// sections. If the new top-most critical section is inactive, then it is
// resumed.
static inline void
_PyCriticalSection_Pop(_PyCriticalSection *c)
{
PyThreadState *tstate = _PyThreadState_GET();
uintptr_t prev = c->prev;
tstate->critical_section = prev;

if ((prev & _Py_CRITICAL_SECTION_INACTIVE) != 0) {
_PyCriticalSection_Resume(tstate);
}
}

static inline void
_PyCriticalSection_End(_PyCriticalSection *c)
{
PyMutex_Unlock(c->mutex);
_PyCriticalSection_Pop(c);
}

static inline void
_PyCriticalSection2_Begin(_PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2)
{
if (m1 == m2) {
// If the two mutex arguments are the same, treat this as a critical
// section with a single mutex.
c->mutex2 = NULL;
_PyCriticalSection_Begin(&c->base, m1);
return;
}

if ((uintptr_t)m2 < (uintptr_t)m1) {
// Sort the mutexes so that the lower address is locked first.
// The exact order does not matter, but we need to acquire the mutexes
// in a consistent order to avoid lock ordering deadlocks.
PyMutex *tmp = m1;
m1 = m2;
m2 = tmp;
}

if (PyMutex_LockFast(&m1->v)) {
if (PyMutex_LockFast(&m2->v)) {
PyThreadState *tstate = _PyThreadState_GET();
c->base.mutex = m1;
c->mutex2 = m2;
c->base.prev = tstate->critical_section;

uintptr_t p = (uintptr_t)c | _Py_CRITICAL_SECTION_TWO_MUTEXES;
tstate->critical_section = p;
}
else {
_PyCriticalSection2_BeginSlow(c, m1, m2, 1);
}
}
else {
_PyCriticalSection2_BeginSlow(c, m1, m2, 0);
}
}

static inline void
_PyCriticalSection2_End(_PyCriticalSection2 *c)
{
if (c->mutex2) {
PyMutex_Unlock(c->mutex2);
}
PyMutex_Unlock(c->base.mutex);
_PyCriticalSection_Pop(&c->base);
}

PyAPI_FUNC(void)
_PyCriticalSection_SuspendAll(PyThreadState *tstate);

#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_CRITICAL_SECTION_H */
20 changes: 17 additions & 3 deletions Include/internal/pycore_lock.h
@@ -32,9 +32,16 @@ extern "C" {
// PyMutex_Lock(&m);
// ...
// PyMutex_Unlock(&m);
typedef struct _PyMutex {
uint8_t v;
} PyMutex;

// NOTE: In Py_NOGIL builds, `struct _PyMutex` is defined in Include/object.h.
// The Py_NOGIL builds need the definition in Include/object.h for the
// `ob_mutex` field in PyObject. For the default (non-free-threaded) build,
// we define the struct here to avoid exposing it in the public API.
#ifndef Py_NOGIL
struct _PyMutex { uint8_t v; };
#endif

typedef struct _PyMutex PyMutex;

#define _Py_UNLOCKED 0
#define _Py_LOCKED 1
@@ -46,6 +53,13 @@ PyAPI_FUNC(void) _PyMutex_LockSlow(PyMutex *m);
// (private) slow path for unlocking the mutex
PyAPI_FUNC(void) _PyMutex_UnlockSlow(PyMutex *m);

static inline int
PyMutex_LockFast(uint8_t *lock_bits)
{
uint8_t expected = _Py_UNLOCKED;
return _Py_atomic_compare_exchange_uint8(lock_bits, &expected, _Py_LOCKED);
}
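
// A hedged sketch (an assumption, not the definition used in this header) of
// how the fast path composes with the slow path; `lock_mutex` is a
// hypothetical name:
//
//   static inline void
//   lock_mutex(PyMutex *m)
//   {
//       if (!PyMutex_LockFast(&m->v)) {
//           _PyMutex_LockSlow(m);  // park the thread until the mutex is free
//       }
//   }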

// Locks the mutex.
//
// If the mutex is currently locked, the calling thread will be parked until
8 changes: 6 additions & 2 deletions Include/object.h
@@ -119,7 +119,7 @@ check by comparing the reference count field to the immortality reference count.
{ \
0, \
0, \
0, \
{ 0 }, \
0, \
_Py_IMMORTAL_REFCNT_LOCAL, \
0, \
@@ -204,10 +204,14 @@ struct _object {
// Create a shared field from a refcnt and desired flags
#define _Py_REF_SHARED(refcnt, flags) (((refcnt) << _Py_REF_SHARED_SHIFT) + (flags))

// NOTE: In non-free-threaded builds, `struct _PyMutex` is defined in
// pycore_lock.h. See pycore_lock.h for more details.
struct _PyMutex { uint8_t v; };

struct _object {
uintptr_t ob_tid; // thread id (or zero)
uint16_t _padding;
uint8_t ob_mutex; // per-object lock
struct _PyMutex ob_mutex; // per-object lock
uint8_t ob_gc_bits; // gc-related state
uint32_t ob_ref_local; // local reference count
Py_ssize_t ob_ref_shared; // shared (atomic) reference count
2 changes: 2 additions & 0 deletions Makefile.pre.in
@@ -409,6 +409,7 @@ PYTHON_OBJS= \
Python/codecs.o \
Python/compile.o \
Python/context.o \
Python/critical_section.o \
Python/crossinterp.o \
Python/dynamic_annotations.o \
Python/errors.o \
@@ -1802,6 +1803,7 @@ PYTHON_HEADERS= \
$(srcdir)/Include/internal/pycore_complexobject.h \
$(srcdir)/Include/internal/pycore_condvar.h \
$(srcdir)/Include/internal/pycore_context.h \
$(srcdir)/Include/internal/pycore_critical_section.h \
$(srcdir)/Include/internal/pycore_crossinterp.h \
$(srcdir)/Include/internal/pycore_dict.h \
$(srcdir)/Include/internal/pycore_dict_state.h \
@@ -0,0 +1,3 @@
Implement "Python Critical Sections" from :pep:`703`. These are macros to
help replace the GIL with per-object locks in the ``--disable-gil`` build of
CPython. The macros are no-ops in the default build.
2 changes: 1 addition & 1 deletion Modules/Setup.stdlib.in
@@ -158,7 +158,7 @@
@MODULE_XXSUBTYPE_TRUE@xxsubtype xxsubtype.c
@MODULE__XXTESTFUZZ_TRUE@_xxtestfuzz _xxtestfuzz/_xxtestfuzz.c _xxtestfuzz/fuzzer.c
@MODULE__TESTBUFFER_TRUE@_testbuffer _testbuffer.c
@MODULE__TESTINTERNALCAPI_TRUE@_testinternalcapi _testinternalcapi.c _testinternalcapi/test_lock.c _testinternalcapi/pytime.c _testinternalcapi/set.c
@MODULE__TESTINTERNALCAPI_TRUE@_testinternalcapi _testinternalcapi.c _testinternalcapi/test_lock.c _testinternalcapi/pytime.c _testinternalcapi/set.c _testinternalcapi/test_critical_sections.c
@MODULE__TESTCAPI_TRUE@_testcapi _testcapimodule.c _testcapi/vectorcall.c _testcapi/vectorcall_limited.c _testcapi/heaptype.c _testcapi/abstract.c _testcapi/bytearray.c _testcapi/bytes.c _testcapi/unicode.c _testcapi/dict.c _testcapi/set.c _testcapi/list.c _testcapi/tuple.c _testcapi/getargs.c _testcapi/datetime.c _testcapi/docstring.c _testcapi/mem.c _testcapi/watchers.c _testcapi/long.c _testcapi/float.c _testcapi/complex.c _testcapi/numbers.c _testcapi/structmember.c _testcapi/exceptions.c _testcapi/code.c _testcapi/buffer.c _testcapi/pyatomic.c _testcapi/pyos.c _testcapi/file.c _testcapi/codec.c _testcapi/immortal.c _testcapi/heaptype_relative.c _testcapi/gc.c _testcapi/sys.c
@MODULE__TESTCLINIC_TRUE@_testclinic _testclinic.c
@MODULE__TESTCLINIC_LIMITED_TRUE@_testclinic_limited _testclinic_limited.c
3 changes: 3 additions & 0 deletions Modules/_testinternalcapi.c
@@ -1687,6 +1687,9 @@ module_exec(PyObject *module)
if (_PyTestInternalCapi_Init_Set(module) < 0) {
return 1;
}
if (_PyTestInternalCapi_Init_CriticalSection(module) < 0) {
return 1;
}

if (PyModule_Add(module, "SIZEOF_PYGC_HEAD",
PyLong_FromSsize_t(sizeof(PyGC_Head))) < 0) {
1 change: 1 addition & 0 deletions Modules/_testinternalcapi/parts.h
@@ -13,5 +13,6 @@
int _PyTestInternalCapi_Init_Lock(PyObject *module);
int _PyTestInternalCapi_Init_PyTime(PyObject *module);
int _PyTestInternalCapi_Init_Set(PyObject *module);
int _PyTestInternalCapi_Init_CriticalSection(PyObject *module);

#endif // Py_TESTINTERNALCAPI_PARTS_H