diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h
index d521eac79d1b97..ee0823d1533f2d 100644
--- a/Include/cpython/optimizer.h
+++ b/Include/cpython/optimizer.h
@@ -60,8 +60,8 @@ PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void);
 
 PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int offset);
 
-int
-_PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer);
+int _PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer);
+int _PyOptimizer_Unanchored(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **pexecutor, PyObject **stack_pointer);
 
 extern _PyOptimizerObject _PyOptimizer_Default;
diff --git a/Include/internal/pycore_uops.h b/Include/internal/pycore_uops.h
index 153884f4bd2902..b92b2678bfa913 100644
--- a/Include/internal/pycore_uops.h
+++ b/Include/internal/pycore_uops.h
@@ -21,6 +21,9 @@ typedef struct {
 
 typedef struct {
     _PyExecutorObject base;
+    // Auxiliary arrays, allocated after trace[base.ob_size]
+    uint16_t *counters;  // An array of counters
+    _PyExecutorObject **executors;  // An array of executors
     _PyUOpInstruction trace[1];
 } _PyUOpExecutorObject;
diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py
index 9f4731103c9413..a6ce3a22a32aca 100644
--- a/Lib/test/test_capi/test_opt.py
+++ b/Lib/test/test_capi/test_opt.py
@@ -539,6 +539,27 @@ def testfunc(n):
         # too much already.
         self.assertEqual(count, 1)
 
+    def test_side_exits(self):
+        def testfunc():
+            for _ in range(100):
+                for i in range(100):
+                    if i >= 70:
+                        i = 0
+
+        opt = _testinternalcapi.get_uop_optimizer()
+        with temporary_optimizer(opt):
+            testfunc()
+
+        ex = get_first_executor(testfunc)
+        self.assertIsNotNone(ex)
+        uops = {opname for opname, _, _ in ex}
+        self.assertIn("_GUARD_IS_FALSE_POP", uops)
+        subs = [sub for sub in ex.sub_executors() if sub is not None]
+        self.assertGreater(len(subs), 0)
+        sub = subs[0]
+        sub_uops = {opname for opname, _, _ in sub}
+        self.assertIn("_GUARD_IS_TRUE_POP", sub_uops)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-12-13-15-30-19.gh-issue-112354.Z6yyTb.rst b/Misc/NEWS.d/next/Core and Builtins/2023-12-13-15-30-19.gh-issue-112354.Z6yyTb.rst
new file mode 100644
index 00000000000000..dbc4d9a8309b3c
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-12-13-15-30-19.gh-issue-112354.Z6yyTb.rst
@@ -0,0 +1,2 @@
+In the Tier 2 interpreter, add side exits to sub-executors for certain
+micro-opcodes (currently only conditional branches).
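The `pycore_uops.h` hunk above gives each uop executor two auxiliary arrays (per-uop side-exit counters and per-uop sub-executor pointers) that live in the same variable-length allocation as the trace; the `tp_itemsize` and `make_executor_from_uops` hunks later in the diff pay for and initialize them. The standalone C sketch below, using simplified stand-in types rather than the real CPython structs (`Inst`, `Executor`, and `new_executor` are invented for the illustration), shows how such a layout can be carved out of a single block, mirroring the pointer arithmetic in the patch.

```c
/* Illustration only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct { uint16_t opcode, oparg; uint64_t operand; } Inst;  /* stand-in for _PyUOpInstruction */

typedef struct Executor {
    uint16_t *counters;           /* side-exit counters, one per uop */
    struct Executor **executors;  /* side-exit targets, one slot per uop */
    Inst trace[1];                /* actually `length` instructions */
} Executor;

static Executor *new_executor(int length)
{
    /* One block: header + `length` items, where each item is
       sizeof(Inst) + sizeof(Executor *) + sizeof(uint16_t),
       mirroring the enlarged tp_itemsize in the patch. */
    size_t item = sizeof(Inst) + sizeof(Executor *) + sizeof(uint16_t);
    Executor *ex = malloc(sizeof(Executor) - sizeof(Inst) + length * item);
    if (ex == NULL) {
        return NULL;
    }
    ex->executors = (Executor **)&ex->trace[length];    /* right after the trace */
    ex->counters = (uint16_t *)&ex->executors[length];  /* right after the pointers */
    memset(ex->executors, 0, length * sizeof(Executor *));
    memset(ex->counters, 0, length * sizeof(uint16_t));
    return ex;
}

int main(void)
{
    Executor *ex = new_executor(8);
    if (ex == NULL) {
        return 1;
    }
    printf("counters[3] = %u, executors[3] = %p\n",
           (unsigned)ex->counters[3], (void *)ex->executors[3]);
    free(ex);
    return 0;
}
```

One apparent benefit of this layout is that the side tables need no separate allocation or deallocation; they are freed together with the trace they describe.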
diff --git a/Python/ceval.c b/Python/ceval.c
index 27304d31e27949..93e2d705d84584 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -755,6 +755,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
     next_instr = frame->instr_ptr;
 resume_frame:
     stack_pointer = _PyFrame_GetStackPointer(frame);
+resume_frame_using_stack_pointer:
 
 #ifdef LLTRACE
     lltrace = maybe_lltrace_resume_frame(frame, &entry_frame, GLOBALS());
@@ -1063,17 +1064,123 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
 
 // Jump here from DEOPT_IF()
 deoptimize:
-    next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame));
+    frame->instr_ptr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame));
     DPRINTF(2, "DEOPT: [UOp %d (%s), oparg %d, operand %" PRIu64 ", target %d @ %d -> %s]\n",
             uopcode, _PyUOpName(uopcode), next_uop[-1].oparg, next_uop[-1].operand, next_uop[-1].target,
             (int)(next_uop - current_executor->trace - 1),
             _PyOpcode_OpName[frame->instr_ptr->op.code]);
     OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
     UOP_STAT_INC(uopcode, miss);
-    Py_DECREF(current_executor);
-    DISPATCH();
+    frame->return_offset = 0;  // Don't leave this random
+
+    // Check if there is a side-exit executor here already.
+    int pc = (int)(next_uop - 1 - current_executor->trace);
+    _PyExecutorObject **pexecutor = current_executor->executors + pc;
+    if (*pexecutor != NULL) {
+#ifdef Py_DEBUG
+        PyCodeObject *code = _PyFrame_GetCode(frame);
+        DPRINTF(2, "Jumping to new executor for %s (%s:%d) at byte offset %d\n",
+                PyUnicode_AsUTF8(code->co_qualname),
+                PyUnicode_AsUTF8(code->co_filename),
+                code->co_firstlineno,
+                2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame))));
+#endif
+        _PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject *)Py_NewRef(*pexecutor);
+        Py_DECREF(current_executor);
+        current_executor = new_executor;
+        goto enter_tier_two;
+    }
+
+    // Increment and check side exit counter.
+    // (Even though we only need it for certain opcodes.)
+    next_instr = frame->instr_ptr;
+    uint16_t *pcounter = current_executor->counters + pc;
+    *pcounter += 1 << OPTIMIZER_BITS_IN_COUNTER;
+    /* We are using unsigned values, but we really want signed values, so
+     * do the 2s complement comparison manually */
+    uint16_t ucounter = *pcounter + (1 << 15);
+    uint16_t threshold = tstate->interp->optimizer_resume_threshold + (1 << 15);
+    if (ucounter <= threshold)
+    {
+        Py_DECREF(current_executor);
+        goto resume_frame_using_stack_pointer;
+    }
+
+    // Decode instruction to look past EXTENDED_ARG.
+    opcode = next_instr[0].op.code;
+    if (opcode == EXTENDED_ARG) {
+        opcode = next_instr[1].op.code;
+    }
+
+    // For selected opcodes build a new executor and enter it now.
+    if (opcode == POP_JUMP_IF_FALSE ||
+        opcode == POP_JUMP_IF_TRUE ||
+        opcode == POP_JUMP_IF_NONE ||
+        opcode == POP_JUMP_IF_NOT_NONE)
+    {
+        DPRINTF(2, "--> %s @ %d in %p has %d side exits\n",
+                _PyUOpName(uopcode), pc, current_executor, (int)(*pcounter));
+        DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[opcode]);
+
+        _PyExecutorObject *tmp_executor = NULL;
+        int optimized = _PyOptimizer_Unanchored(frame, next_instr, &tmp_executor, stack_pointer);
+        if (optimized < 0) {
+            goto error_tier_two;
+        }
+        if (!optimized) {
+            DPRINTF(2, "--> Failed to optimize %s @ %d in %p\n",
+                    _PyUOpName(uopcode), pc, current_executor);
+        }
+        else {
+#ifdef Py_DEBUG
+            DPRINTF(1, "--> Optimized %s @ %d in %p\n",
+                    _PyUOpName(uopcode), pc, current_executor);
+            PyCodeObject *code = _PyFrame_GetCode(frame);
+            DPRINTF(2, "Jumping to fresh executor for %s (%s:%d) at byte offset %d\n",
+                    PyUnicode_AsUTF8(code->co_qualname),
+                    PyUnicode_AsUTF8(code->co_filename),
+                    code->co_firstlineno,
+                    2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame))));
+#endif
+            _PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject *)Py_NewRef(tmp_executor);
+
+            // Reject trace if it repeats the uop that just deoptimized.
+            int jump_opcode = new_executor->trace[0].opcode;
+            if (jump_opcode == _IS_NONE) {
+                jump_opcode = new_executor->trace[1].opcode;
+            }
+            if (jump_opcode != uopcode) {
+                *pexecutor = tmp_executor;
+                *pcounter &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
+                Py_DECREF(current_executor);
+                current_executor = new_executor;
+                goto enter_tier_two;  // All systems go!
+            }
+
+            // The trace is guaranteed to deopt again; forget about it.
+            DPRINTF(2, "Alas, it's the same uop again (%s) -- discarding trace\n",
+                    _PyUOpName(jump_opcode));
+            Py_DECREF(tmp_executor);
+            Py_DECREF(new_executor);
+        }
+    }
+
+    // Exponential backoff if we didn't optimize.
+    int backoff = *pcounter & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
+    if (backoff < MINIMUM_TIER2_BACKOFF) {
+        backoff = MINIMUM_TIER2_BACKOFF;
+    }
+    else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) {
+        backoff++;
+    }
+    assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER);
+    *pcounter = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff;
+
+    Py_DECREF(current_executor);
+    goto resume_frame_using_stack_pointer;
 }
+
 #if defined(__GNUC__)
 # pragma GCC diagnostic pop
 #elif defined(_MSC_VER) /* MS_WINDOWS */
diff --git a/Python/optimizer.c b/Python/optimizer.c
index d44e733bc346fa..9a38d599cf39da 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -159,7 +159,7 @@ int
 _PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer)
 {
     assert(src->op.code == JUMP_BACKWARD);
-    PyCodeObject *code = (PyCodeObject *)frame->f_executable;
+    PyCodeObject *code = _PyFrame_GetCode(frame);
     assert(PyCode_Check(code));
     PyInterpreterState *interp = _PyInterpreterState_GET();
     if (!has_space_for_executor(code, src)) {
@@ -189,6 +189,27 @@ _PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNI
     return 1;
 }
 
+// Return an unanchored executor. The caller owns the executor when returning 1.
+// No ENTER_EXECUTOR is inserted, nor is the executor added to the code object.
+int
+_PyOptimizer_Unanchored(
+    _PyInterpreterFrame *frame,
+    _Py_CODEUNIT *instr,
+    _PyExecutorObject **pexecutor,
+    PyObject **stack_pointer)
+{
+    assert(instr->op.code != ENTER_EXECUTOR);
+    PyCodeObject *code = _PyFrame_GetCode(frame);
+    assert(PyCode_Check(code));
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    _PyOptimizerObject *opt = interp->optimizer;
+    if (strcmp(opt->ob_base.ob_type->tp_name, "uop_optimizer") != 0) {
+        return 0;
+    }
+    *pexecutor = NULL;
+    return opt->optimize(opt, code, instr, pexecutor, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
+}
+
 _PyExecutorObject *
 PyUnstable_GetExecutor(PyCodeObject *code, int offset)
 {
@@ -321,6 +342,11 @@ PyUnstable_Optimizer_NewCounter(void)
 
 static void
 uop_dealloc(_PyUOpExecutorObject *self) {
     _Py_ExecutorClear((_PyExecutorObject *)self);
+    if (self->executors != NULL) {
+        for (Py_ssize_t i = Py_SIZE(self); --i >= 0; ) {
+            Py_XDECREF(self->executors[i]);
+        }
+    }
     PyObject_Free(self);
 }
@@ -375,15 +401,41 @@ PySequenceMethods uop_as_sequence = {
     .sq_item = (ssizeargfunc)uop_item,
 };
 
+static PyObject *
+sub_executors(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    _PyUOpExecutorObject *executor = (_PyUOpExecutorObject *)self;
+    Py_ssize_t len = uop_len(executor);
+    PyObject *list = PyList_New(len);
+    if (list == NULL) {
+        return NULL;
+    }
+    for (Py_ssize_t i = 0; i < len; i++) {
+        PyObject *sub = (PyObject *)executor->executors[i];
+        if (sub == NULL) {
+            sub = Py_None;
+        }
+        Py_INCREF(sub);
+        PyList_SET_ITEM(list, i, (PyObject *)sub);
+    }
+    return list;
+}
+
+static PyMethodDef uop_executor_methods[] = {
+    { "is_valid", is_valid, METH_NOARGS, NULL },
+    { "sub_executors", sub_executors, METH_NOARGS, NULL },
+    { NULL, NULL },
+};
+
 PyTypeObject _PyUOpExecutor_Type = {
     PyVarObject_HEAD_INIT(&PyType_Type, 0)
     .tp_name = "uop_executor",
     .tp_basicsize = sizeof(_PyUOpExecutorObject) - sizeof(_PyUOpInstruction),
-    .tp_itemsize = sizeof(_PyUOpInstruction),
+    .tp_itemsize = sizeof(_PyUOpInstruction) + sizeof(uint16_t) + sizeof(_PyExecutorObject *),
     .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
     .tp_dealloc = (destructor)uop_dealloc,
     .tp_as_sequence = &uop_as_sequence,
-    .tp_methods = executor_methods,
+    .tp_methods = uop_executor_methods,
 };
 
 /* TO DO -- Generate these tables */
@@ -499,7 +551,7 @@ translate_bytecode_to_trace(
         code = trace_stack[trace_stack_depth].code; \
         instr = trace_stack[trace_stack_depth].instr;
 
-    DPRINTF(4,
+    DPRINTF(2,
            "Optimizing %s (%s:%d) at byte offset %d\n",
            PyUnicode_AsUTF8(code->co_qualname),
            PyUnicode_AsUTF8(code->co_filename),
@@ -825,6 +877,10 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
     if (executor == NULL) {
         return NULL;
     }
+    executor->executors = (_PyExecutorObject **)(&executor->trace[length]);
+    executor->counters = (uint16_t *)(&executor->executors[length]);
+    memset(executor->executors, 0, sizeof(_PyExecutorObject *) * length);
+    memset(executor->counters, 0, sizeof(uint16_t) * length);
     int dest = length - 1;
     /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
     for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) {
@@ -933,9 +989,8 @@ PyUnstable_Optimizer_NewUOpOptimizer(void)
         return NULL;
     }
     opt->optimize = uop_optimize;
-    opt->resume_threshold = INT16_MAX;
-    // Need at least 3 iterations to settle specializations.
-    // A few lower bits of the counter are reserved for other flags.
+    // The lower bits are reserved for exponential backoff.
+    opt->resume_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER;
     opt->backedge_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER;
     return (PyObject *)opt;
 }
diff --git a/Python/specialize.c b/Python/specialize.c
index 7c2a4a42b1dcc3..f8ec59d2ebc273 100644
--- a/Python/specialize.c
+++ b/Python/specialize.c
@@ -2353,6 +2353,7 @@ int
 void
 _Py_Specialize_ForIter(PyObject *iter, _Py_CODEUNIT *instr, int oparg)
 {
+    assert(_PyOpcode_Deopt[instr->op.code] == FOR_ITER);
     assert(ENABLE_SPECIALIZATION);
     assert(_PyOpcode_Caches[FOR_ITER] == INLINE_CACHE_ENTRIES_FOR_ITER);
     _PyForIterCache *cache = (_PyForIterCache *)(instr + 1);
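For reference, a standalone sketch of the counter scheme used in the new `deoptimize:` path above: the low `OPTIMIZER_BITS_IN_COUNTER` bits of each 16-bit side-exit counter hold the current backoff exponent, the remaining high bits count deoptimizations (each deopt adds `1 << OPTIMIZER_BITS_IN_COUNTER`), the threshold test is a signed comparison done in unsigned arithmetic by biasing both sides with `1 << 15`, and a failed optimization attempt parks the counter below the threshold again. The constant values below are assumptions chosen for the demo, not taken from the patch.

```c
/* Illustration only -- simulates the side-exit counter and backoff arithmetic. */
#include <stdint.h>
#include <stdio.h>

#define OPTIMIZER_BITS_IN_COUNTER 4  /* assumed value for the demo */
#define MINIMUM_TIER2_BACKOFF 4      /* assumed minimum backoff exponent */

static uint16_t resume_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER;

/* Returns 1 when the counter says "try to optimize this side exit now". */
static int bump_and_check(uint16_t *pcounter)
{
    *pcounter += 1 << OPTIMIZER_BITS_IN_COUNTER;
    /* Unsigned values biased by 1 << 15 behave like a signed comparison. */
    uint16_t ucounter = *pcounter + (1 << 15);
    uint16_t threshold = resume_threshold + (1 << 15);
    return ucounter > threshold;
}

/* Called when the optimization attempt failed: back off exponentially. */
static void apply_backoff(uint16_t *pcounter)
{
    int backoff = *pcounter & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
    if (backoff < MINIMUM_TIER2_BACKOFF) {
        backoff = MINIMUM_TIER2_BACKOFF;
    }
    else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) {
        backoff++;
    }
    /* Park the counter below the threshold so that about (1 << backoff)
       additional deopts, on top of the base threshold, are needed before the
       next attempt; remember the exponent in the low bits. */
    *pcounter = (uint16_t)(((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff);
}

int main(void)
{
    uint16_t counter = 0;
    int deopts = 0;
    for (int attempt = 0; attempt < 4; attempt++) {
        while (!bump_and_check(&counter)) {
            deopts++;
        }
        printf("attempt %d fired after %d deopts (counter=0x%04x)\n",
               attempt, deopts, (unsigned int)counter);
        apply_backoff(&counter);  /* pretend the optimization attempt failed */
        deopts = 0;
    }
    return 0;
}
```

Running the simulation shows the gap between successive optimization attempts roughly doubling once the minimum backoff is reached, which is the behavior the backoff branch in ceval.c appears designed to produce.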