From 9a2d5590dbc3f51fdbb6abaaab56e2cdb7262c52 Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Thu, 10 Jun 2021 21:32:44 -0500
Subject: [PATCH 1/9] First stab. Reduce pow() overhead for small exponents.
 Incidentally fixed a refcount oversight for negative exponents.

---
 Objects/longobject.c | 39 ++++++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/Objects/longobject.c b/Objects/longobject.c
index e1c1191e648dae..79703fa41762e6 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -4185,6 +4185,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
                 goto Error;
             Py_DECREF(a);
             a = temp;
+            temp = NULL;
         }
 
         /* Reduce base by modulus in some cases:
@@ -4239,18 +4240,38 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
         REDUCE(result);                         \
     } while(0)
 
-    if (Py_SIZE(b) <= FIVEARY_CUTOFF) {
-        /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */
-        /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf    */
-        for (i = Py_SIZE(b) - 1; i >= 0; --i) {
+    i = Py_SIZE(b);
+    if (i <= FIVEARY_CUTOFF) {
+        if (i > 0) {
+            /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */
+            /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf    */
+            --i;
             digit bi = b->ob_digit[i];
-
+            /* Find the first significant exponent bit. */
             for (j = (digit)1 << (PyLong_SHIFT-1); j != 0; j >>= 1) {
-                MULT(z, z, z);
-                if (bi & j)
-                    MULT(z, a, z);
+                if (bi & j) {
+                    Py_INCREF(a);
+                    Py_DECREF(z); /* z is currently 1 */
+                    z = a;
+                    REDUCE(z);
+                    break;
+                }
             }
-        }
+            assert(j);  /* else we never found a bit set */
+            for (j >>= 1;;) {
+                for (; j != 0; j >>= 1) {
+                    MULT(z, z, z);
+                    if (bi & j) {
+                        MULT(z, a, z);
+                    }
+                }
+                if (--i < 0) {
+                    break;
+                }
+                bi = b->ob_digit[i];
+                j = (digit)1 << (PyLong_SHIFT-1);
+            }
+        }/* if (i > 0); else b is 0, and z=1 is correct */
     }
     else {
         /* Left-to-right 5-ary exponentiation (HAC Algorithm 14.82) */

From ff6e798d741f531f10cda5dea4ff56380bf44c76 Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Thu, 10 Jun 2021 22:36:10 -0500
Subject: [PATCH 2/9] Repair that, e.g., pow(False, 1) returned False instead of
 0. It's annoying, but I don't want to fight about it ;-)

---
 Objects/longobject.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/Objects/longobject.c b/Objects/longobject.c
index 79703fa41762e6..0f9abc03ccd068 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -4250,10 +4250,15 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
             /* Find the first significant exponent bit. */
             for (j = (digit)1 << (PyLong_SHIFT-1); j != 0; j >>= 1) {
                 if (bi & j) {
-                    Py_INCREF(a);
-                    Py_DECREF(z); /* z is currently 1 */
-                    z = a;
-                    REDUCE(z);
+                    /* Found the first bit. We would like to simply set z to
+                     * `a` now. But, if we do, and b is 1, pow() will return
+                     * `a` then. Which is of a wrong type if `a` is an instance
+                     * of an int subclass. test_bool actually griped about
+                     * that, demanding that, e.g., pos(False, 1) is not False.
+                     * The seemingly useless multiplication by 1 is done
+                     * solely to worm around that.
+                     */
+                    MULT(z, a, z);
                     break;
                 }
             }

From 7f867fa078862e6de62c132dcac0e0efaf7f4dc8 Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Thu, 10 Jun 2021 22:43:33 -0500
Subject: [PATCH 3/9] Fixed typo in comment.

---
 Objects/longobject.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Objects/longobject.c b/Objects/longobject.c
index 0f9abc03ccd068..c15997d0d43846 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -4254,7 +4254,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
                      * `a` now. But, if we do, and b is 1, pow() will return
                      * `a` then. Which is of a wrong type if `a` is an instance
                      * of an int subclass. test_bool actually griped about
-                     * that, demanding that, e.g., pos(False, 1) is not False.
+                     * that, demanding that, e.g., pow(False, 1) is not False.
                      * The seemingly useless multiplication by 1 is done
                      * solely to worm around that.
                      */

From e8c403485926e192bb4021d0ddf586e4ba0fc4fb Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Fri, 11 Jun 2021 11:22:41 -0500
Subject: [PATCH 4/9] Search for the most significant exponent bit
 right-to-left instead of left-to-right. This cuts the time for exponent 2 by
 another 10%. Does that slow things for, e.g., an exponent like 1 << 20? Sure,
 by a little.  But that exponent will go on to do 20 squarings, and the extra
 native 32-bit shift-and-tests in the loop are insignificant in comparison.
 For a tiny exponent (like 2), the loop's shift-and-tests can consume a
 significant part of the total time.

Also reverted the change I made to repair refcounting in
modular inverse code. Mark will make the change in a
distinct PR, and backport it. Without this change, the
"negative refcount" glitch stopped showing up in this PR's
code after I changed it to never return `a` directly.
---
 Objects/longobject.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/Objects/longobject.c b/Objects/longobject.c
index c15997d0d43846..4b616e9cabefcf 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -4185,7 +4185,6 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
                 goto Error;
             Py_DECREF(a);
             a = temp;
-            temp = NULL;
         }
 
         /* Reduce base by modulus in some cases:
@@ -4247,9 +4246,15 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
             /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf    */
             --i;
             digit bi = b->ob_digit[i];
-            /* Find the first significant exponent bit. */
-            for (j = (digit)1 << (PyLong_SHIFT-1); j != 0; j >>= 1) {
-                if (bi & j) {
+            /* Find the first significant exponent bit. Search right to left
+             * because we're primarily trying to cut overhead for small
+             * exponents, like 2 and 3.
+             */
+            assert(bi);  /* else there is no significant bit */
+            for (j = 2; ; j <<=  1) {
+                if (j > bi) {
+                    assert((bi & j) == 0);
+                    j >>= 1;
                     /* Found the first bit. We would like to simply set z to
                      * `a` now. But, if we do, and b is 1, pow() will return
                      * `a` then. Which is of a wrong type if `a` is an instance
@@ -4262,7 +4267,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
                     break;
                 }
             }
-            assert(j);  /* else we never found a bit set */
+            assert(bi & j);
             for (j >>= 1;;) {
                 for (; j != 0; j >>= 1) {
                     MULT(z, z, z);

From a3ace88e5019dd4d56449fa971b505eef636f61a Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Fri, 11 Jun 2021 17:37:16 +0000
Subject: [PATCH 5/9] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../Core and Builtins/2021-06-11-17-37-15.bpo-44376.zhM1UW.rst   | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-06-11-17-37-15.bpo-44376.zhM1UW.rst

diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-06-11-17-37-15.bpo-44376.zhM1UW.rst b/Misc/NEWS.d/next/Core and Builtins/2021-06-11-17-37-15.bpo-44376.zhM1UW.rst
new file mode 100644
index 00000000000000..f854d56b3c8419
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-06-11-17-37-15.bpo-44376.zhM1UW.rst	
@@ -0,0 +1 @@
+Exact integer exponentiation (like ``i**2`` or ``pow(i, 2)``) with a small exponent is much faster, due to reducing overhead in such cases.
\ No newline at end of file

From 7587a02540b0d6420625228c3d461bd58be6585a Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Fri, 11 Jun 2021 16:49:18 -0500
Subject: [PATCH 6/9] Skip the binary business entirely for exponents <= 3. It
 can't win until the exponent is at least 4. Saves approaching another 10% for
 exponent 2.

---
 Objects/longobject.c | 75 ++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 38 deletions(-)

diff --git a/Objects/longobject.c b/Objects/longobject.c
index 4b616e9cabefcf..f7d05cfb08b400 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -4240,48 +4240,47 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
     } while(0)
 
     i = Py_SIZE(b);
-    if (i <= FIVEARY_CUTOFF) {
-        if (i > 0) {
-            /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */
-            /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf    */
-            --i;
-            digit bi = b->ob_digit[i];
-            /* Find the first significant exponent bit. Search right to left
-             * because we're primarily trying to cut overhead for small
-             * exponents, like 2 and 3.
-             */
-            assert(bi);  /* else there is no significant bit */
-            for (j = 2; ; j <<=  1) {
-                if (j > bi) {
-                    assert((bi & j) == 0);
-                    j >>= 1;
-                    /* Found the first bit. We would like to simply set z to
-                     * `a` now. But, if we do, and b is 1, pow() will return
-                     * `a` then. Which is of a wrong type if `a` is an instance
-                     * of an int subclass. test_bool actually griped about
-                     * that, demanding that, e.g., pow(False, 1) is not False.
-                     * The seemingly useless multiplication by 1 is done
-                     * solely to worm around that.
-                     */
+    digit bi = i ? b->ob_digit[i-1] : 0;
+    if (i <= 1 && bi <= 3) {
+        /* exponent <= 3 - just do straight multiplies. Note that the multiply
+         * 1 * a -> z serves a purpose when bi is 1: if `a` is of an int
+         * subclass, it ensures the result is an int. For example, that
+         * pow(False, 1) returns 0 instead of False.
+         */
+        while (bi-- > 0) {
+             MULT(z, a, z);
+        }
+    }
+    else if (i <= FIVEARY_CUTOFF) {
+        /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */
+        /* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf    */
+
+        /* Find the first significant exponent bit. Search right to left
+         * because we're primarily trying to cut overhead for small powers.
+         */
+        assert(bi);  /* else there is no significant bit */
+        for (j = 2; ; j <<= 1) {
+            if (j > bi) { /* found the first bit */
+                assert((bi & j) == 0);
+                j >>= 1;
+                MULT(z, a, z);
+                break;
+            }
+        }
+        assert(bi & j);
+        for (--i, j >>= 1;;) {
+            for (; j != 0; j >>= 1) {
+                MULT(z, z, z);
+                if (bi & j) {
                     MULT(z, a, z);
-                    break;
                 }
             }
-            assert(bi & j);
-            for (j >>= 1;;) {
-                for (; j != 0; j >>= 1) {
-                    MULT(z, z, z);
-                    if (bi & j) {
-                        MULT(z, a, z);
-                    }
-                }
-                if (--i < 0) {
-                    break;
-                }
-                bi = b->ob_digit[i];
-                j = (digit)1 << (PyLong_SHIFT-1);
+            if (--i < 0) {
+                break;
             }
-        }/* if (i > 0); else b is 0, and z=1 is correct */
+            bi = b->ob_digit[i];
+            j = (digit)1 << (PyLong_SHIFT-1);
+        }
     }
     else {
         /* Left-to-right 5-ary exponentiation (HAC Algorithm 14.82) */

From 2386d8833d9bfc6c1ce563b63fb7874c6fd2a4c8 Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Fri, 11 Jun 2021 18:54:30 -0500
Subject: [PATCH 7/9] Make the bitmask variable of type `digit`. Using a giant
 signed int instead works OK, but was always a conceptual mess.

---
 Objects/longobject.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/Objects/longobject.c b/Objects/longobject.c
index f7d05cfb08b400..adf948f121d7a3 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -4241,6 +4241,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
 
     i = Py_SIZE(b);
     digit bi = i ? b->ob_digit[i-1] : 0;
+    digit bit;
     if (i <= 1 && bi <= 3) {
         /* exponent <= 3 - just do straight multiplies. Note that the multiply
          * 1 * a -> z serves a purpose when bi is 1: if `a` is of an int
@@ -4259,19 +4260,19 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
          * because we're primarily trying to cut overhead for small powers.
          */
         assert(bi);  /* else there is no significant bit */
-        for (j = 2; ; j <<= 1) {
-            if (j > bi) { /* found the first bit */
-                assert((bi & j) == 0);
-                j >>= 1;
+        for (bit = 2; ; bit <<= 1) {
+            if (bit > bi) { /* found the first bit */
+                assert((bi & bit) == 0);
+                bit >>= 1;
                 MULT(z, a, z);
                 break;
             }
         }
-        assert(bi & j);
-        for (--i, j >>= 1;;) {
-            for (; j != 0; j >>= 1) {
+        assert(bi & bit);
+        for (--i, bit >>= 1;;) {
+            for (; bit != 0; bit >>= 1) {
                 MULT(z, z, z);
-                if (bi & j) {
+                if (bi & bit) {
                     MULT(z, a, z);
                 }
             }
@@ -4279,7 +4280,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
                 break;
             }
             bi = b->ob_digit[i];
-            j = (digit)1 << (PyLong_SHIFT-1);
+            bit = (digit)1 << (PyLong_SHIFT-1);
         }
     }
     else {

From c424322bf997d21a0a992623bdfa44f54985ec0c Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Sat, 12 Jun 2021 10:49:55 -0500
Subject: [PATCH 8/9] And one more stab to reduce overhead for exponents <= 3.
 Close to another 10% saved for exponent 2.

---
 Objects/longobject.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/Objects/longobject.c b/Objects/longobject.c
index adf948f121d7a3..980f6b319d22e8 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -4243,14 +4243,21 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
     digit bi = i ? b->ob_digit[i-1] : 0;
     digit bit;
     if (i <= 1 && bi <= 3) {
-        /* exponent <= 3 - just do straight multiplies. Note that the multiply
-         * 1 * a -> z serves a purpose when bi is 1: if `a` is of an int
-         * subclass, it ensures the result is an int. For example, that
-         * pow(False, 1) returns 0 instead of False.
-         */
-        while (bi-- > 0) {
-             MULT(z, a, z);
+        /* aim for minimal overhead */
+        if (bi >= 2) {
+            MULT(a, a, z);
+            if (bi == 3) {
+                MULT(z, a, z);
+            }
+        }
+        else if (bi == 1) {
+            /* Multiplying by 1 serves two purposes: if `a` is of an int
+             * subclass, makes the result an int (e.g., pow(False, 1) returns
+             * 0 instead of False), and potetially reduces `a` by the modulus.
+             */
+            MULT(a, z, z);
         }
+        /* else bi is 0, and z==1 is correct */
     }
     else if (i <= FIVEARY_CUTOFF) {
         /* Left-to-right binary exponentiation (HAC Algorithm 14.79) */
@@ -4260,15 +4267,17 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
          * because we're primarily trying to cut overhead for small powers.
          */
         assert(bi);  /* else there is no significant bit */
+        Py_INCREF(a);
+        Py_DECREF(z);
+        z = a;
         for (bit = 2; ; bit <<= 1) {
             if (bit > bi) { /* found the first bit */
                 assert((bi & bit) == 0);
                 bit >>= 1;
-                MULT(z, a, z);
+                assert(bi & bit);
                 break;
             }
         }
-        assert(bi & bit);
         for (--i, bit >>= 1;;) {
             for (; bit != 0; bit >>= 1) {
                 MULT(z, z, z);

From e3cd20a1a3cbca8dc259c3a6953fe3b8f386c122 Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Sat, 12 Jun 2021 10:58:08 -0500
Subject: [PATCH 9/9] Repaired typo in comment.

---
 Objects/longobject.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Objects/longobject.c b/Objects/longobject.c
index 980f6b319d22e8..5e29e9a7257093 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -4253,7 +4253,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
         else if (bi == 1) {
             /* Multiplying by 1 serves two purposes: if `a` is of an int
              * subclass, makes the result an int (e.g., pow(False, 1) returns
-             * 0 instead of False), and potetially reduces `a` by the modulus.
+             * 0 instead of False), and potentially reduces `a` by the modulus.
              */
             MULT(a, z, z);
         }