diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 00000000..1037180b --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,22 @@ +######################################################### +# Build arm64 wheels for OSX on Cirrus CI +######################################################### + +cirrus_wheels_macos_arm64_task: + name: Build macOS arm64 wheels. + macos_instance: + image: ghcr.io/cirruslabs/macos-monterey-xcode:13.3.1 + env: + PATH: /opt/homebrew/opt/python@3.10/bin:$PATH + CIBW_ARCHS_MACOS: arm64 + install_pre_requirements_script: + - brew install python@3.10 + - ln -s python3 /opt/homebrew/opt/python@3.10/bin/python + - which python + - python --version + install_cibuildwheel_script: + - python -m pip install cibuildwheel==2.11.4 + run_cibuildwheel_script: + - bin/cibw.sh + wheels_artifacts: + path: "wheelhouse/*" diff --git a/.github/workflows/buildwheel.yml b/.github/workflows/buildwheel.yml index f8b4dd13..34bd3007 100644 --- a/.github/workflows/buildwheel.yml +++ b/.github/workflows/buildwheel.yml @@ -40,7 +40,7 @@ jobs: CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 CIBW_MANYLINUX_I686_IMAGE: manylinux2014 CIBW_BEFORE_ALL_LINUX: bin/cibw_before_all_linux.sh - CIBW_BEFORE_ALL_MACOS: bin/cibw_before_all_macosx.sh + CIBW_BEFORE_ALL_MACOS: bin/cibw_before_all_macosx_x86_64.sh CIBW_BEFORE_ALL_WINDOWS: msys2 -c bin/cibw_before_all_windows.sh CIBW_BEFORE_BUILD_WINDOWS: msys2 -c bin/cibw_before_build_windows.sh CIBW_BEFORE_BUILD: pip install numpy cython delvewheel diff --git a/bin/build_dependencies_unix.sh b/bin/build_dependencies_unix.sh index 8524a34a..fefa9227 100755 --- a/bin/build_dependencies_unix.sh +++ b/bin/build_dependencies_unix.sh @@ -9,12 +9,15 @@ set -o errexit # # # Supported options: # # # -# --gmp gmp - build based on GMP (default) # -# --gmp mpir - build based on MPIR (instead of GMP) # +# --gmp gmp - build based on GMP (default) # +# --gmp mpir - build based on MPIR (no longer works) # +# --host - set the host (target) for GMP build 
# +# --patch-gmp-arm64 - apply patch to GMP for OSX arm64 # # # # ------------------------------------------------------------------------- # USE_GMP=gmp +PATCH_GMP_ARM64=no while [[ $# -gt 0 ]] do @@ -41,6 +44,14 @@ do shift shift ;; + --patch-gmp-arm64) + PATCH_GMP_ARM64=yes + shift + ;; + *) + >&2 echo "unrecognised argument:" "$key" + exit 1 + ;; esac done @@ -74,6 +85,23 @@ if [ $USE_GMP = "gmp" ]; then curl -O https://gmplib.org/download/gmp/gmp-$GMPVER.tar.xz tar xf gmp-$GMPVER.tar.xz cd gmp-$GMPVER + + # + # See https://github.com/aleaxit/gmpy/issues/350 + # + # We need to patch GMP for OSX arm64 (Apple M1) hardware. This patch is + # from the GMP repo but was applied after the release of GMP 6.2.1. + # Hopefully when a newer version of GMP is released we will not need to + # apply this patch any more. + # + if [ $PATCH_GMP_ARM64 = "yes" ]; then + echo + echo -------------------------------------------- + echo " patching GMP" + echo -------------------------------------------- + patch -N -Z -p0 < ../../../bin/patch-arm64.diff + fi + # Show the output of configfsf.guess ./configfsf.guess ./configure --prefix=$PREFIX\ @@ -109,6 +137,18 @@ else # # # ----------------------------------------------------------------------- # + # + # The mpir.org domain has expired and no longer hosts the source code so the + # call to curl below will fail. + # We could try to download from https://github.com/wbhart/mpir/releases. + # + # Ultimately it seems that MPIR is no longer maintained though so for now + # this remains unfixed. 
+ # + + >&2 echo "MPIR build of python_flint is no longer supported" + exit 1 + curl -O http://mpir.org/mpir-$MPIRVER.tar.bz2 tar xf mpir-$MPIRVER.tar.bz2 cd mpir-$MPIRVER diff --git a/bin/cibw.sh b/bin/cibw.sh new file mode 100755 index 00000000..1e20ce34 --- /dev/null +++ b/bin/cibw.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# +# This script can be used to test cibuildwheel locally on OSX/Linux +# +# It is also worth commenting out the BEFORE_ALL line to build GMP etc after you have +# built those once because that is by far the slowest step. +# + +rm -f wheelhouse/* + +# bin/build_dependencies_unix.sh places headers and shared libraries under .local +export CIBW_ENVIRONMENT='C_INCLUDE_PATH=$(pwd)/.local/include/ LIBRARY_PATH=$(pwd)/.local/lib/ LD_LIBRARY_PATH=$(pwd)/.local/lib:$LD_LIBRARY_PATH PYTHON_FLINT_MINGW64=true' + +export CIBW_BUILD='cp39-* cp310-* cp311-*' +# export CIBW_BUILD='cp311-*' +export CIBW_SKIP='*-win32 *-manylinux_i686 *-musllinux_*' + +# export CIBW_ARCHS_MACOS="x86_64" +export CIBW_ARCHS_MACOS="arm64" + +export CIBW_BEFORE_ALL_LINUX=bin/cibw_before_all_linux.sh +# export CIBW_BEFORE_ALL_MACOS=bin/cibw_before_all_macosx_x86_64.sh +export CIBW_BEFORE_ALL_MACOS=bin/cibw_before_all_macosx_arm64.sh +export CIBW_BEFORE_ALL_WINDOWS='C:\\msys64\\usr\\bin\\bash bin/cibw_before_all_windows.sh' + +export CIBW_BEFORE_BUILD='pip install numpy cython delvewheel' +export CIBW_BEFORE_BUILD_WINDOWS='C:\\msys64\\usr\\bin\\bash bin/cibw_before_build_windows.sh' + +export CIBW_REPAIR_WHEEL_COMMAND_WINDOWS='bin\cibw_repair_wheel_command_windows.bat {dest_dir} {wheel}' + +# export CIBW_TEST_COMMAND="python -c 'import flint; print(str(flint.fmpz(2)))'" +export CIBW_TEST_COMMAND="python {project}/test/test.py && python {project}/test/dtest.py" + +# cibuildwheel --platform linux +# cibuildwheel --platform windows +cibuildwheel --platform macos diff --git a/bin/cibw_before_all_macosx_arm64.sh b/bin/cibw_before_all_macosx_arm64.sh new file mode 100755 index 
00000000..2fc3dee8 --- /dev/null +++ b/bin/cibw_before_all_macosx_arm64.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +export CPPFLAGS=" --target=arm64-apple-macos11" +export LDFLAGS=" -arch arm64" + +bin/build_dependencies_unix.sh\ + --gmp gmp\ + --host aarch64-apple-darwin\ + --patch-gmp-arm64 diff --git a/bin/cibw_before_all_macosx.sh b/bin/cibw_before_all_macosx_x86_64.sh similarity index 100% rename from bin/cibw_before_all_macosx.sh rename to bin/cibw_before_all_macosx_x86_64.sh diff --git a/bin/patch-arm64.diff b/bin/patch-arm64.diff new file mode 100644 index 00000000..2edabbb1 --- /dev/null +++ b/bin/patch-arm64.diff @@ -0,0 +1,520 @@ + +# HG changeset patch +# User Torbjorn Granlund +# Date 1606685500 -3600 +# Node ID 5f32dbc41afc1f8cd77af1614f0caeb24deb7d7b +# Parent 94c84d919f83ba963ed1809f8e80c7bef32db55c +Avoid the x18 register since it is reserved on Darwin. + +diff -r 94c84d919f83 -r 5f32dbc41afc mpn/arm64/aors_n.asm +--- mpn/arm64/aors_n.asm Sat Nov 28 23:38:32 2020 +0100 ++++ mpn/arm64/aors_n.asm Sun Nov 29 22:31:40 2020 +0100 +@@ -68,7 +68,7 @@ + EPILOGUE() + PROLOGUE(func_n) + CLRCY +-L(ent): lsr x18, n, #2 ++L(ent): lsr x17, n, #2 + tbz n, #0, L(bx0) + + L(bx1): ldr x7, [up] +@@ -77,7 +77,7 @@ + str x13, [rp],#8 + tbnz n, #1, L(b11) + +-L(b01): cbz x18, L(ret) ++L(b01): cbz x17, L(ret) + ldp x4, x5, [up,#8] + ldp x8, x9, [vp,#8] + sub up, up, #8 +@@ -88,7 +88,7 @@ + ldp x10, x11, [vp,#8] + add up, up, #8 + add vp, vp, #8 +- cbz x18, L(end) ++ cbz x17, L(end) + b L(top) + + L(bx0): tbnz n, #1, L(b10) +@@ -101,7 +101,7 @@ + + L(b10): ldp x6, x7, [up] + ldp x10, x11, [vp] +- cbz x18, L(end) ++ cbz x17, L(end) + + ALIGN(16) + L(top): ldp x4, x5, [up,#16] +@@ -114,8 +114,8 @@ + ADDSUBC x12, x4, x8 + ADDSUBC x13, x5, x9 + stp x12, x13, [rp],#16 +- sub x18, x18, #1 +- cbnz x18, L(top) ++ sub x17, x17, #1 ++ cbnz x17, L(top) + + L(end): ADDSUBC x12, x6, x10 + ADDSUBC x13, x7, x11 +diff -r 94c84d919f83 -r 5f32dbc41afc mpn/arm64/aorsmul_1.asm +--- 
mpn/arm64/aorsmul_1.asm Sat Nov 28 23:38:32 2020 +0100 ++++ mpn/arm64/aorsmul_1.asm Sun Nov 29 22:31:40 2020 +0100 +@@ -32,10 +32,15 @@ + + include(`../config.m4') + +-C cycles/limb +-C Cortex-A53 9.3-9.8 +-C Cortex-A57 7.0 +-C X-Gene 5.0 ++C addmul_1 submul_1 ++C cycles/limb cycles/limb ++C Cortex-A53 9.3-9.8 9.3-9.8 ++C Cortex-A55 9.0-9.5 9.3-9.8 ++C Cortex-A57 7 7 ++C Cortex-A72 ++C Cortex-A73 6 6 ++C X-Gene 5 5 ++C Apple M1 1.75 1.75 + + C NOTES + C * It is possible to keep the carry chain alive between the addition blocks +diff -r 94c84d919f83 -r 5f32dbc41afc mpn/arm64/aorsorrlshC_n.asm +--- mpn/arm64/aorsorrlshC_n.asm Sat Nov 28 23:38:32 2020 +0100 ++++ mpn/arm64/aorsorrlshC_n.asm Sun Nov 29 22:31:40 2020 +0100 +@@ -65,14 +65,14 @@ + + ASM_START() + PROLOGUE(func_n) +- lsr x18, n, #2 ++ lsr x6, n, #2 + tbz n, #0, L(bx0) + + L(bx1): ldr x5, [up] + tbnz n, #1, L(b11) + + L(b01): ldr x11, [vp] +- cbz x18, L(1) ++ cbz x6, L(1) + ldp x8, x9, [vp,#8] + lsl x13, x11, #LSH + ADDSUB( x15, x13, x5) +@@ -94,7 +94,7 @@ + ADDSUB( x17, x13, x5) + str x17, [rp],#8 + sub up, up, #8 +- cbz x18, L(end) ++ cbz x6, L(end) + b L(top) + + L(bx0): tbnz n, #1, L(b10) +@@ -107,7 +107,7 @@ + L(b10): CLRRCY( x9) + ldp x10, x11, [vp] + sub up, up, #16 +- cbz x18, L(end) ++ cbz x6, L(end) + + ALIGN(16) + L(top): ldp x4, x5, [up,#16] +@@ -124,8 +124,8 @@ + ADDSUBC(x16, x12, x4) + ADDSUBC(x17, x13, x5) + stp x16, x17, [rp],#16 +- sub x18, x18, #1 +- cbnz x18, L(top) ++ sub x6, x6, #1 ++ cbnz x6, L(top) + + L(end): ldp x4, x5, [up,#16] + extr x12, x10, x9, #RSH +diff -r 94c84d919f83 -r 5f32dbc41afc mpn/arm64/cnd_aors_n.asm +--- mpn/arm64/cnd_aors_n.asm Sat Nov 28 23:38:32 2020 +0100 ++++ mpn/arm64/cnd_aors_n.asm Sun Nov 29 22:31:40 2020 +0100 +@@ -65,7 +65,7 @@ + + CLRCY + +- lsr x18, n, #2 ++ lsr x17, n, #2 + tbz n, #0, L(bx0) + + L(bx1): ldr x13, [vp] +@@ -75,7 +75,7 @@ + str x9, [rp] + tbnz n, #1, L(b11) + +-L(b01): cbz x18, L(rt) ++L(b01): cbz x17, L(rt) + ldp x12, x13, [vp,#8] + ldp 
x10, x11, [up,#8] + sub up, up, #8 +@@ -86,7 +86,7 @@ + L(b11): ldp x12, x13, [vp,#8]! + ldp x10, x11, [up,#8]! + sub rp, rp, #8 +- cbz x18, L(end) ++ cbz x17, L(end) + b L(top) + + L(bx0): ldp x12, x13, [vp] +@@ -99,7 +99,7 @@ + b L(mid) + + L(b10): sub rp, rp, #16 +- cbz x18, L(end) ++ cbz x17, L(end) + + ALIGN(16) + L(top): bic x6, x12, cnd +@@ -116,8 +116,8 @@ + ADDSUBC x9, x11, x7 + ldp x10, x11, [up,#32]! + stp x8, x9, [rp,#32]! +- sub x18, x18, #1 +- cbnz x18, L(top) ++ sub x17, x17, #1 ++ cbnz x17, L(top) + + L(end): bic x6, x12, cnd + bic x7, x13, cnd +diff -r 94c84d919f83 -r 5f32dbc41afc mpn/arm64/logops_n.asm +--- mpn/arm64/logops_n.asm Sat Nov 28 23:38:32 2020 +0100 ++++ mpn/arm64/logops_n.asm Sun Nov 29 22:31:40 2020 +0100 +@@ -78,7 +78,7 @@ + + ASM_START() + PROLOGUE(func) +- lsr x18, n, #2 ++ lsr x17, n, #2 + tbz n, #0, L(bx0) + + L(bx1): ldr x7, [up] +@@ -88,7 +88,7 @@ + str x15, [rp],#8 + tbnz n, #1, L(b11) + +-L(b01): cbz x18, L(ret) ++L(b01): cbz x17, L(ret) + ldp x4, x5, [up,#8] + ldp x8, x9, [vp,#8] + sub up, up, #8 +@@ -99,7 +99,7 @@ + ldp x10, x11, [vp,#8] + add up, up, #8 + add vp, vp, #8 +- cbz x18, L(end) ++ cbz x17, L(end) + b L(top) + + L(bx0): tbnz n, #1, L(b10) +@@ -110,7 +110,7 @@ + + L(b10): ldp x6, x7, [up] + ldp x10, x11, [vp] +- cbz x18, L(end) ++ cbz x17, L(end) + + ALIGN(16) + L(top): ldp x4, x5, [up,#16] +@@ -127,8 +127,8 @@ + POSTOP( x12) + POSTOP( x13) + stp x12, x13, [rp],#16 +- sub x18, x18, #1 +- cbnz x18, L(top) ++ sub x17, x17, #1 ++ cbnz x17, L(top) + + L(end): LOGOP( x12, x6, x10) + LOGOP( x13, x7, x11) +diff -r 94c84d919f83 -r 5f32dbc41afc mpn/arm64/lshift.asm +--- mpn/arm64/lshift.asm Sat Nov 28 23:38:32 2020 +0100 ++++ mpn/arm64/lshift.asm Sun Nov 29 22:31:40 2020 +0100 +@@ -61,7 +61,7 @@ + add rp, rp_arg, n, lsl #3 + add up, up, n, lsl #3 + sub tnc, xzr, cnt +- lsr x18, n, #2 ++ lsr x17, n, #2 + tbz n, #0, L(bx0) + + L(bx1): ldr x4, [up,#-8] +@@ -69,7 +69,7 @@ + + L(b01): NSHIFT x0, x4, tnc + PSHIFT x2, x4, cnt +- 
cbnz x18, L(gt1) ++ cbnz x17, L(gt1) + str x2, [rp,#-8] + ret + L(gt1): ldp x4, x5, [up,#-24] +@@ -89,7 +89,7 @@ + PSHIFT x13, x5, cnt + NSHIFT x10, x4, tnc + PSHIFT x2, x4, cnt +- cbnz x18, L(gt2) ++ cbnz x17, L(gt2) + orr x10, x10, x13 + stp x2, x10, [rp,#-16] + ret +@@ -123,11 +123,11 @@ + orr x11, x12, x2 + stp x10, x11, [rp,#-32]! + PSHIFT x2, x4, cnt +-L(lo0): sub x18, x18, #1 ++L(lo0): sub x17, x17, #1 + L(lo3): NSHIFT x10, x6, tnc + PSHIFT x13, x7, cnt + NSHIFT x12, x7, tnc +- cbnz x18, L(top) ++ cbnz x17, L(top) + + L(end): orr x10, x10, x13 + orr x11, x12, x2 +diff -r 94c84d919f83 -r 5f32dbc41afc mpn/arm64/lshiftc.asm +--- mpn/arm64/lshiftc.asm Sat Nov 28 23:38:32 2020 +0100 ++++ mpn/arm64/lshiftc.asm Sun Nov 29 22:31:40 2020 +0100 +@@ -61,7 +61,7 @@ + add rp, rp_arg, n, lsl #3 + add up, up, n, lsl #3 + sub tnc, xzr, cnt +- lsr x18, n, #2 ++ lsr x17, n, #2 + tbz n, #0, L(bx0) + + L(bx1): ldr x4, [up,#-8] +@@ -69,7 +69,7 @@ + + L(b01): NSHIFT x0, x4, tnc + PSHIFT x2, x4, cnt +- cbnz x18, L(gt1) ++ cbnz x17, L(gt1) + mvn x2, x2 + str x2, [rp,#-8] + ret +@@ -90,7 +90,7 @@ + PSHIFT x13, x5, cnt + NSHIFT x10, x4, tnc + PSHIFT x2, x4, cnt +- cbnz x18, L(gt2) ++ cbnz x17, L(gt2) + eon x10, x10, x13 + mvn x2, x2 + stp x2, x10, [rp,#-16] +@@ -125,11 +125,11 @@ + eon x11, x12, x2 + stp x10, x11, [rp,#-32]! 
+ PSHIFT x2, x4, cnt +-L(lo0): sub x18, x18, #1 ++L(lo0): sub x17, x17, #1 + L(lo3): NSHIFT x10, x6, tnc + PSHIFT x13, x7, cnt + NSHIFT x12, x7, tnc +- cbnz x18, L(top) ++ cbnz x17, L(top) + + L(end): eon x10, x10, x13 + eon x11, x12, x2 +diff -r 94c84d919f83 -r 5f32dbc41afc mpn/arm64/mul_1.asm +--- mpn/arm64/mul_1.asm Sat Nov 28 23:38:32 2020 +0100 ++++ mpn/arm64/mul_1.asm Sun Nov 29 22:31:40 2020 +0100 +@@ -56,7 +56,7 @@ + + PROLOGUE(mpn_mul_1) + adds x4, xzr, xzr C clear register and cy flag +-L(com): lsr x18, n, #2 ++L(com): lsr x17, n, #2 + tbnz n, #0, L(bx1) + + L(bx0): mov x11, x4 +@@ -65,7 +65,7 @@ + L(b10): ldp x4, x5, [up] + mul x8, x4, v0 + umulh x10, x4, v0 +- cbz x18, L(2) ++ cbz x17, L(2) + ldp x6, x7, [up,#16]! + mul x9, x5, v0 + b L(mid)-8 +@@ -80,7 +80,7 @@ + str x9, [rp],#8 + tbnz n, #1, L(b10) + +-L(b01): cbz x18, L(1) ++L(b01): cbz x17, L(1) + + L(b00): ldp x6, x7, [up] + mul x8, x6, v0 +@@ -90,8 +90,8 @@ + adcs x12, x8, x11 + umulh x11, x7, v0 + add rp, rp, #16 +- sub x18, x18, #1 +- cbz x18, L(end) ++ sub x17, x17, #1 ++ cbz x17, L(end) + + ALIGN(16) + L(top): mul x8, x4, v0 +@@ -110,8 +110,8 @@ + stp x12, x13, [rp],#32 + adcs x12, x8, x11 + umulh x11, x7, v0 +- sub x18, x18, #1 +- cbnz x18, L(top) ++ sub x17, x17, #1 ++ cbnz x17, L(top) + + L(end): mul x8, x4, v0 + adcs x13, x9, x10 +diff -r 94c84d919f83 -r 5f32dbc41afc mpn/arm64/rsh1aors_n.asm +--- mpn/arm64/rsh1aors_n.asm Sat Nov 28 23:38:32 2020 +0100 ++++ mpn/arm64/rsh1aors_n.asm Sun Nov 29 22:31:40 2020 +0100 +@@ -59,7 +59,7 @@ + + ASM_START() + PROLOGUE(func_n) +- lsr x18, n, #2 ++ lsr x6, n, #2 + + tbz n, #0, L(bx0) + +@@ -69,7 +69,7 @@ + + L(b01): ADDSUB x13, x5, x9 + and x10, x13, #1 +- cbz x18, L(1) ++ cbz x6, L(1) + ldp x4, x5, [up],#48 + ldp x8, x9, [vp],#48 + ADDSUBC x14, x4, x8 +@@ -80,8 +80,8 @@ + ADDSUBC x12, x4, x8 + ADDSUBC x13, x5, x9 + str x17, [rp], #24 +- sub x18, x18, #1 +- cbz x18, L(end) ++ sub x6, x6, #1 ++ cbz x6, L(end) + b L(top) + + L(1): cset x14, COND +@@ -97,7 
+97,7 @@ + ldp x8, x9, [vp],#32 + ADDSUBC x12, x4, x8 + ADDSUBC x13, x5, x9 +- cbz x18, L(3) ++ cbz x6, L(3) + ldp x4, x5, [up,#-16] + ldp x8, x9, [vp,#-16] + extr x17, x12, x15, #1 +@@ -117,7 +117,7 @@ + ADDSUB x12, x4, x8 + ADDSUBC x13, x5, x9 + and x10, x12, #1 +- cbz x18, L(2) ++ cbz x6, L(2) + ldp x4, x5, [up,#-16] + ldp x8, x9, [vp,#-16] + ADDSUBC x14, x4, x8 +@@ -134,8 +134,8 @@ + ADDSUBC x12, x4, x8 + ADDSUBC x13, x5, x9 + add rp, rp, #16 +- sub x18, x18, #1 +- cbz x18, L(end) ++ sub x6, x6, #1 ++ cbz x6, L(end) + + ALIGN(16) + L(top): ldp x4, x5, [up,#-16] +@@ -152,8 +152,8 @@ + ADDSUBC x12, x4, x8 + ADDSUBC x13, x5, x9 + stp x16, x17, [rp],#32 +- sub x18, x18, #1 +- cbnz x18, L(top) ++ sub x6, x6, #1 ++ cbnz x6, L(top) + + L(end): extr x16, x15, x14, #1 + extr x17, x12, x15, #1 +diff -r 94c84d919f83 -r 5f32dbc41afc mpn/arm64/rshift.asm +--- mpn/arm64/rshift.asm Sat Nov 28 23:38:32 2020 +0100 ++++ mpn/arm64/rshift.asm Sun Nov 29 22:31:40 2020 +0100 +@@ -60,7 +60,7 @@ + PROLOGUE(mpn_rshift) + mov rp, rp_arg + sub tnc, xzr, cnt +- lsr x18, n, #2 ++ lsr x17, n, #2 + tbz n, #0, L(bx0) + + L(bx1): ldr x5, [up] +@@ -68,7 +68,7 @@ + + L(b01): NSHIFT x0, x5, tnc + PSHIFT x2, x5, cnt +- cbnz x18, L(gt1) ++ cbnz x17, L(gt1) + str x2, [rp] + ret + L(gt1): ldp x4, x5, [up,#8] +@@ -89,7 +89,7 @@ + PSHIFT x13, x4, cnt + NSHIFT x10, x5, tnc + PSHIFT x2, x5, cnt +- cbnz x18, L(gt2) ++ cbnz x17, L(gt2) + orr x10, x10, x13 + stp x10, x2, [rp] + ret +@@ -121,11 +121,11 @@ + orr x11, x12, x2 + stp x11, x10, [rp,#32]! 
+ PSHIFT x2, x5, cnt +-L(lo0): sub x18, x18, #1 ++L(lo0): sub x17, x17, #1 + L(lo3): NSHIFT x10, x7, tnc + NSHIFT x12, x6, tnc + PSHIFT x13, x6, cnt +- cbnz x18, L(top) ++ cbnz x17, L(top) + + L(end): orr x10, x10, x13 + orr x11, x12, x2 +diff -r 94c84d919f83 -r 5f32dbc41afc mpn/arm64/sqr_diag_addlsh1.asm +--- mpn/arm64/sqr_diag_addlsh1.asm Sat Nov 28 23:38:32 2020 +0100 ++++ mpn/arm64/sqr_diag_addlsh1.asm Sun Nov 29 22:31:40 2020 +0100 +@@ -47,7 +47,7 @@ + ASM_START() + PROLOGUE(mpn_sqr_diag_addlsh1) + ldr x15, [up],#8 +- lsr x18, n, #1 ++ lsr x14, n, #1 + tbz n, #0, L(bx0) + + L(bx1): adds x7, xzr, xzr +@@ -62,8 +62,8 @@ + ldr x17, [up],#16 + ldp x6, x7, [tp],#32 + umulh x11, x15, x15 +- sub x18, x18, #1 +- cbz x18, L(end) ++ sub x14, x14, #1 ++ cbz x14, L(end) + + ALIGN(16) + L(top): extr x9, x6, x5, #63 +@@ -84,8 +84,8 @@ + extr x8, x5, x4, #63 + stp x12, x13, [rp],#16 + adcs x12, x8, x10 +- sub x18, x18, #1 +- cbnz x18, L(top) ++ sub x14, x14, #1 ++ cbnz x14, L(top) + + L(end): extr x9, x6, x5, #63 + mul x10, x17, x17