From 9a2d5590dbc3f51fdbb6abaaab56e2cdb7262c52 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 10 Jun 2021 21:32:44 -0500 Subject: [PATCH 1/9] First stab. Reduce pow() overhead for small exponents. Incidentally fixed a refcount oversight for negative exponents. --- Objects/longobject.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index e1c1191e648dae..79703fa41762e6 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -4185,6 +4185,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) goto Error; Py_DECREF(a); a = temp; + temp = NULL; } /* Reduce base by modulus in some cases: @@ -4239,18 +4240,38 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) REDUCE(result); \ } while(0) - if (Py_SIZE(b) <= FIVEARY_CUTOFF) { - /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */ - /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf */ - for (i = Py_SIZE(b) - 1; i >= 0; --i) { + i = Py_SIZE(b); + if (i <= FIVEARY_CUTOFF) { + if (i > 0) { + /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */ + /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf */ + --i; digit bi = b->ob_digit[i]; - + /* Find the first significant exponent bit. 
*/ for (j = (digit)1 << (PyLong_SHIFT-1); j != 0; j >>= 1) { - MULT(z, z, z); - if (bi & j) - MULT(z, a, z); + if (bi & j) { + Py_INCREF(a); + Py_DECREF(z); /* z is currently 1 */ + z = a; + REDUCE(z); + break; + } } - } + assert(j); /* else we never found a bit set */ + for (j >>= 1;;) { + for (; j != 0; j >>= 1) { + MULT(z, z, z); + if (bi & j) { + MULT(z, a, z); + } + } + if (--i < 0) { + break; + } + bi = b->ob_digit[i]; + j = (digit)1 << (PyLong_SHIFT-1); + } + }/* if (i > 0); else b is 0, and z=1 is correct */ } else { /* Left-to-right 5-ary exponentiation (HAC Algorithm 14.82) */ From ff6e798d741f531f10cda5dea4ff56380bf44c76 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 10 Jun 2021 22:36:10 -0500 Subject: [PATCH 2/9] Repair that, e.g, pow(False, 1) returned False instead of 0. It's annoying, but I don't want to fight about it ;-) --- Objects/longobject.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index 79703fa41762e6..0f9abc03ccd068 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -4250,10 +4250,15 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) /* Find the first significant exponent bit. */ for (j = (digit)1 << (PyLong_SHIFT-1); j != 0; j >>= 1) { if (bi & j) { - Py_INCREF(a); - Py_DECREF(z); /* z is currently 1 */ - z = a; - REDUCE(z); + /* Found the first bit. We would like to simply set z to + * `a` now. But, if we do, and b is 1, pow() will return + * `a` then. Which is of a wrong type if `a` is an instance + * of an int subclass. test_bool actually griped about + * that, demanding that, e.g., pos(False, 1) is not False. + * The seemingly useless multiplication by 1 is done + * solely to worm around that. + */ + MULT(z, a, z); break; } } From 7f867fa078862e6de62c132dcac0e0efaf7f4dc8 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 10 Jun 2021 22:43:33 -0500 Subject: [PATCH 3/9] Fixed typo in comment. 
--- Objects/longobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index 0f9abc03ccd068..c15997d0d43846 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -4254,7 +4254,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) * `a` now. But, if we do, and b is 1, pow() will return * `a` then. Which is of a wrong type if `a` is an instance * of an int subclass. test_bool actually griped about - * that, demanding that, e.g., pos(False, 1) is not False. + * that, demanding that, e.g., pow(False, 1) is not False. * The seemingly useless multiplication by 1 is done * solely to worm around that. */ From e8c403485926e192bb4021d0ddf586e4ba0fc4fb Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 11 Jun 2021 11:22:41 -0500 Subject: [PATCH 4/9] Search for the most significant exponent bit right-to-left instead of left-to-right. This cuts the time for exponent 2 by another 10%. Does that slow things for, e.g., an exponent like 1 << 20? Sure, by a little. But that exponent will go on to do 20 squarings, and the extra native 32-bit shift- and-tests in the loop are insignificant in comparison. For a tiny exponent (like 2), the loop's shift-and-tests can consume a significant part of the total time. Also reverted the change I made to repair refcounting in modular inverse code. Mark will make the change in a distinct PR, and backport it. Without this change, the "negative refcount" glitch stopped showing up in this PR's code after I changed it to never return `a` directly. 
--- Objects/longobject.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index c15997d0d43846..4b616e9cabefcf 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -4185,7 +4185,6 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) goto Error; Py_DECREF(a); a = temp; - temp = NULL; } /* Reduce base by modulus in some cases: @@ -4247,9 +4246,15 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf */ --i; digit bi = b->ob_digit[i]; - /* Find the first significant exponent bit. */ - for (j = (digit)1 << (PyLong_SHIFT-1); j != 0; j >>= 1) { - if (bi & j) { + /* Find the first significant exponent bit. Search right to left + * because we're primarily trying to cut overhead for small + * exponents, like 2 and 3. + */ + assert(bi); /* else there is no significant bit */ + for (j = 2; ; j <<= 1) { + if (j > bi) { + assert((bi & j) == 0); + j >>= 1; /* Found the first bit. We would like to simply set z to * `a` now. But, if we do, and b is 1, pow() will return * `a` then. 
Which is of a wrong type if `a` is an instance @@ -4262,7 +4267,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) break; } } - assert(j); /* else we never found a bit set */ + assert(bi & j); for (j >>= 1;;) { for (; j != 0; j >>= 1) { MULT(z, z, z); From a3ace88e5019dd4d56449fa971b505eef636f61a Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Fri, 11 Jun 2021 17:37:16 +0000 Subject: [PATCH 5/9] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Core and Builtins/2021-06-11-17-37-15.bpo-44376.zhM1UW.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-06-11-17-37-15.bpo-44376.zhM1UW.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-06-11-17-37-15.bpo-44376.zhM1UW.rst b/Misc/NEWS.d/next/Core and Builtins/2021-06-11-17-37-15.bpo-44376.zhM1UW.rst new file mode 100644 index 00000000000000..f854d56b3c8419 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-06-11-17-37-15.bpo-44376.zhM1UW.rst @@ -0,0 +1 @@ +Exact integer exponentiation (like ``i**2`` or ``pow(i, 2)``) with a small exponent is much faster, due to reducing overhead in such cases. \ No newline at end of file From 7587a02540b0d6420625228c3d461bd58be6585a Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 11 Jun 2021 16:49:18 -0500 Subject: [PATCH 6/9] Skip the binary business entirely for exponents <= 3. It can't win until the exponent is at least 4. Saves approaching another 10% for exponent 2. 
--- Objects/longobject.c | 75 ++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index 4b616e9cabefcf..f7d05cfb08b400 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -4240,48 +4240,47 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) } while(0) i = Py_SIZE(b); - if (i <= FIVEARY_CUTOFF) { - if (i > 0) { - /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */ - /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf */ - --i; - digit bi = b->ob_digit[i]; - /* Find the first significant exponent bit. Search right to left - * because we're primarily trying to cut overhead for small - * exponents, like 2 and 3. - */ - assert(bi); /* else there is no significant bit */ - for (j = 2; ; j <<= 1) { - if (j > bi) { - assert((bi & j) == 0); - j >>= 1; - /* Found the first bit. We would like to simply set z to - * `a` now. But, if we do, and b is 1, pow() will return - * `a` then. Which is of a wrong type if `a` is an instance - * of an int subclass. test_bool actually griped about - * that, demanding that, e.g., pow(False, 1) is not False. - * The seemingly useless multiplication by 1 is done - * solely to worm around that. - */ + digit bi = i ? b->ob_digit[i-1] : 0; + if (i <= 1 && bi <= 3) { + /* exponent <= 3 - just do straight multiplies. Note that the multiply + * 1 * a -> z serves a purpose when bi is 1: if `a` is of an int + * subclass, it ensures the result is an int. For example, that + * pow(False, 1) returns 0 instead of False. + */ + while (bi-- > 0) { + MULT(z, a, z); + } + } + else if (i <= FIVEARY_CUTOFF) { + /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */ + /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf */ + + /* Find the first significant exponent bit. Search right to left + * because we're primarily trying to cut overhead for small powers. 
+ */ assert(bi); /* else there is no significant bit */ + for (j = 2; ; j <<= 1) { + if (j > bi) { /* found the first bit */ + assert((bi & j) == 0); + j >>= 1; + MULT(z, a, z); + break; + } + } + assert(bi & j); + for (--i, j >>= 1;;) { + for (; j != 0; j >>= 1) { + MULT(z, z, z); + if (bi & j) { MULT(z, a, z); - break; } } - assert(bi & j); - for (j >>= 1;;) { - for (; j != 0; j >>= 1) { - MULT(z, z, z); - if (bi & j) { - MULT(z, a, z); - } - } - if (--i < 0) { - break; - } - bi = b->ob_digit[i]; - j = (digit)1 << (PyLong_SHIFT-1); + if (--i < 0) { + break; } - }/* if (i > 0); else b is 0, and z=1 is correct */ + bi = b->ob_digit[i]; + j = (digit)1 << (PyLong_SHIFT-1); + } } else { /* Left-to-right 5-ary exponentiation (HAC Algorithm 14.82) */ From 2386d8833d9bfc6c1ce563b63fb7874c6fd2a4c8 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 11 Jun 2021 18:54:30 -0500 Subject: [PATCH 7/9] Make the bitmask variable of type `digit`. Using a giant signed int instead works OK, but was always a conceptual mess. --- Objects/longobject.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index f7d05cfb08b400..adf948f121d7a3 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -4241,6 +4241,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) i = Py_SIZE(b); digit bi = i ? b->ob_digit[i-1] : 0; + digit bit; if (i <= 1 && bi <= 3) { /* exponent <= 3 - just do straight multiplies. Note that the multiply * 1 * a -> z serves a purpose when bi is 1: if `a` is of an int @@ -4259,19 +4260,19 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) * because we're primarily trying to cut overhead for small powers. 
*/ assert(bi); /* else there is no significant bit */ - for (j = 2; ; j <<= 1) { - if (j > bi) { /* found the first bit */ - assert((bi & j) == 0); - j >>= 1; + for (bit = 2; ; bit <<= 1) { + if (bit > bi) { /* found the first bit */ + assert((bi & bit) == 0); + bit >>= 1; MULT(z, a, z); break; } } - assert(bi & j); - for (--i, j >>= 1;;) { - for (; j != 0; j >>= 1) { + assert(bi & bit); + for (--i, bit >>= 1;;) { + for (; bit != 0; bit >>= 1) { MULT(z, z, z); - if (bi & j) { + if (bi & bit) { MULT(z, a, z); } } @@ -4279,7 +4280,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) break; } bi = b->ob_digit[i]; - j = (digit)1 << (PyLong_SHIFT-1); + bit = (digit)1 << (PyLong_SHIFT-1); } } else { From c424322bf997d21a0a992623bdfa44f54985ec0c Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 12 Jun 2021 10:49:55 -0500 Subject: [PATCH 8/9] And one more stab to reduce overhead for exponents <= 3. Close to another 10% saved for exponent 2. --- Objects/longobject.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index adf948f121d7a3..980f6b319d22e8 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -4243,14 +4243,21 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) digit bi = i ? b->ob_digit[i-1] : 0; digit bit; if (i <= 1 && bi <= 3) { - /* exponent <= 3 - just do straight multiplies. Note that the multiply - * 1 * a -> z serves a purpose when bi is 1: if `a` is of an int - * subclass, it ensures the result is an int. For example, that - * pow(False, 1) returns 0 instead of False. - */ - while (bi-- > 0) { - MULT(z, a, z); + /* aim for minimal overhead */ + if (bi >= 2) { + MULT(a, a, z); + if (bi == 3) { + MULT(z, a, z); + } + } + else if (bi == 1) { + /* Multiplying by 1 serves two purposes: if `a` is of an int + * subclass, makes the result an int (e.g., pow(False, 1) returns + * 0 instead of False), and potetially reduces `a` by the modulus. 
+ */ + MULT(a, z, z); } + /* else bi is 0, and z==1 is correct */ } else if (i <= FIVEARY_CUTOFF) { /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */ @@ -4260,15 +4267,17 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) * because we're primarily trying to cut overhead for small powers. */ assert(bi); /* else there is no significant bit */ + Py_INCREF(a); + Py_DECREF(z); + z = a; for (bit = 2; ; bit <<= 1) { if (bit > bi) { /* found the first bit */ assert((bi & bit) == 0); bit >>= 1; - MULT(z, a, z); + assert(bi & bit); break; } } - assert(bi & bit); for (--i, bit >>= 1;;) { for (; bit != 0; bit >>= 1) { MULT(z, z, z); From e3cd20a1a3cbca8dc259c3a6953fe3b8f386c122 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 12 Jun 2021 10:58:08 -0500 Subject: [PATCH 9/9] Repaired typo in comment. --- Objects/longobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index 980f6b319d22e8..5e29e9a7257093 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -4253,7 +4253,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x) else if (bi == 1) { /* Multiplying by 1 serves two purposes: if `a` is of an int * subclass, makes the result an int (e.g., pow(False, 1) returns - * 0 instead of False), and potetially reduces `a` by the modulus. + * 0 instead of False), and potentially reduces `a` by the modulus. */ MULT(a, z, z); }