|
| 1 | +// Copyright 2021 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +//go:build arm64 |
| 6 | +// +build arm64 |
| 7 | + |
| 8 | +#include "textflag.h" |
| 9 | + |
// func keccakF1600(a *[25]uint64)
//
// keccakF1600 runs the 24-iteration round loop below over the 25 64-bit
// lanes that a points to and writes the result back in place.
//
// Register use:
//   R0       pointer to the state (restored to its base before the loop)
//   R1       cursor into round_consts, advanced 8 bytes per round
//   R2       remaining-round counter (24 down to 0)
//   V0-V24   state lanes a[0]..a[24], one lane per vector register
//   V25-V31  per-round temporaries
//
// Lanes are loaded and stored with the .D1 arrangement, so only the low
// 64 bits of each vector register carry state; the arithmetic is done on
// the full 128-bit registers and the upper half is never stored.
//
// VEOR3, VRAX1, VXAR and VBCAX belong to the Armv8.2 SHA3 extension.
// NOTE(review): nothing in this file checks for that extension; presumably
// the Go caller gates this routine on CPU feature detection - confirm.
//
// The frame is empty and the only argument is one 8-byte pointer with no
// results, hence $0-8. NOSPLIT is safe because the routine uses no stack
// and calls nothing.
TEXT ·keccakF1600(SB), NOSPLIT, $0-8
	MOVD a+0(FP), R0
	MOVD $round_consts(SB), R1
	MOVD $24, R2 // counter for loop

	// Load a[0]..a[23] two lanes at a time (post-incrementing R0 by 16),
	// then a[24] on its own.
	VLD1.P 16(R0), [V0.D1, V1.D1]
	VLD1.P 16(R0), [V2.D1, V3.D1]
	VLD1.P 16(R0), [V4.D1, V5.D1]
	VLD1.P 16(R0), [V6.D1, V7.D1]
	VLD1.P 16(R0), [V8.D1, V9.D1]
	VLD1.P 16(R0), [V10.D1, V11.D1]
	VLD1.P 16(R0), [V12.D1, V13.D1]
	VLD1.P 16(R0), [V14.D1, V15.D1]
	VLD1.P 16(R0), [V16.D1, V17.D1]
	VLD1.P 16(R0), [V18.D1, V19.D1]
	VLD1.P 16(R0), [V20.D1, V21.D1]
	VLD1.P 16(R0), [V22.D1, V23.D1]
	VLD1 (R0), [V24.D1]

	// Undo the 12 post-increments (12 * 16 = 192) so R0 points at a[0]
	// again for the stores after the loop.
	SUB $192, R0, R0

loop:
	// theta
	// Column parities: V25..V29 = XOR of the five lanes in columns 0..4
	// (V(25+x) = Vx ^ V(x+5) ^ V(x+10) ^ V(x+15) ^ V(x+20)).
	VEOR3 V20.B16, V15.B16, V10.B16, V25.B16
	VEOR3 V21.B16, V16.B16, V11.B16, V26.B16
	VEOR3 V22.B16, V17.B16, V12.B16, V27.B16
	VEOR3 V23.B16, V18.B16, V13.B16, V28.B16
	VEOR3 V24.B16, V19.B16, V14.B16, V29.B16
	VEOR3 V25.B16, V5.B16, V0.B16, V25.B16
	VEOR3 V26.B16, V6.B16, V1.B16, V26.B16
	VEOR3 V27.B16, V7.B16, V2.B16, V27.B16
	VEOR3 V28.B16, V8.B16, V3.B16, V28.B16
	VEOR3 V29.B16, V9.B16, V4.B16, V29.B16

	// Column mixers, D[x] = C[x-1] ^ rol64(C[x+1], 1). The results land
	// in V30 (D1), V31 (D2), V27 (D3), V28 (D4) and V29 (D0); the last
	// three overwrite parities that are no longer needed at that point.
	VRAX1 V27.D2, V25.D2, V30.D2
	VRAX1 V28.D2, V26.D2, V31.D2
	VRAX1 V29.D2, V27.D2, V27.D2
	VRAX1 V25.D2, V28.D2, V28.D2
	VRAX1 V26.D2, V29.D2, V29.D2

	// theta + rho + Pi
	// Each VXAR XORs a lane with its column mixer and rotates right by
	// 64-r, i.e. rotates left by r, writing straight into the lane's
	// permuted position. The chains below are ordered so that every
	// source lane is read before its register is overwritten; lanes that
	// would be clobbered too early are parked in V25..V31 instead.
	VXAR $64-1, V30.D2, V1.D2, V25.D2

	VXAR $64-44, V30.D2, V6.D2, V1.D2
	VXAR $64-20, V28.D2, V9.D2, V6.D2
	VXAR $64-61, V31.D2, V22.D2, V9.D2
	VXAR $64-39, V28.D2, V14.D2, V22.D2
	VXAR $64-18, V29.D2, V20.D2, V14.D2

	VXAR $64-62, V31.D2, V2.D2, V26.D2

	VXAR $64-43, V31.D2, V12.D2, V2.D2
	VXAR $64-25, V27.D2, V13.D2, V12.D2
	VXAR $64-8, V28.D2, V19.D2, V13.D2
	VXAR $64-56, V27.D2, V23.D2, V19.D2
	VXAR $64-41, V29.D2, V15.D2, V23.D2

	VXAR $64-27, V28.D2, V4.D2, V15.D2

	VXAR $64-14, V28.D2, V24.D2, V28.D2
	VXAR $64-2, V30.D2, V21.D2, V24.D2
	VXAR $64-55, V27.D2, V8.D2, V8.D2
	VXAR $64-45, V30.D2, V16.D2, V4.D2
	VXAR $64-36, V29.D2, V5.D2, V16.D2

	VXAR $64-28, V27.D2, V3.D2, V5.D2

	// Lane 0 has rotation 0 and does not move, so a plain XOR suffices.
	VEOR V29.B16, V0.B16, V0.B16

	VXAR $64-21, V27.D2, V18.D2, V27.D2
	VXAR $64-15, V31.D2, V17.D2, V3.D2
	VXAR $64-10, V30.D2, V11.D2, V30.D2
	VXAR $64-6, V31.D2, V7.D2, V31.D2
	VXAR $64-3, V29.D2, V10.D2, V29.D2

	// chi + iota
	// VBCAX computes n ^ (m &^ a); each group of five rewrites one row.
	// Row 4 (V20..V24).
	VBCAX V8.B16, V22.B16, V26.B16, V20.B16
	VBCAX V22.B16, V23.B16, V8.B16, V21.B16
	VBCAX V23.B16, V24.B16, V22.B16, V22.B16
	VBCAX V24.B16, V26.B16, V23.B16, V23.B16
	VBCAX V26.B16, V8.B16, V24.B16, V24.B16

	// V26 is free now: fetch this round's constant (replicated to both
	// 64-bit lanes) and advance R1 to the next one.
	VLD1R.P 8(R1), [V26.D2]

	// Row 3 (V15..V19).
	VBCAX V3.B16, V19.B16, V30.B16, V17.B16
	VBCAX V19.B16, V15.B16, V3.B16, V18.B16
	VBCAX V15.B16, V16.B16, V19.B16, V19.B16
	VBCAX V16.B16, V30.B16, V15.B16, V15.B16
	VBCAX V30.B16, V3.B16, V16.B16, V16.B16

	// Row 2 (V10..V14).
	VBCAX V31.B16, V12.B16, V25.B16, V10.B16
	VBCAX V12.B16, V13.B16, V31.B16, V11.B16
	VBCAX V13.B16, V14.B16, V12.B16, V12.B16
	VBCAX V14.B16, V25.B16, V13.B16, V13.B16
	VBCAX V25.B16, V31.B16, V14.B16, V14.B16

	// Row 1 (V5..V9).
	VBCAX V4.B16, V9.B16, V29.B16, V7.B16
	VBCAX V9.B16, V5.B16, V4.B16, V8.B16
	VBCAX V5.B16, V6.B16, V9.B16, V9.B16
	VBCAX V6.B16, V29.B16, V5.B16, V5.B16
	VBCAX V29.B16, V4.B16, V6.B16, V6.B16

	// Row 0 (V0..V4).
	VBCAX V28.B16, V0.B16, V27.B16, V3.B16
	VBCAX V0.B16, V1.B16, V28.B16, V4.B16
	VBCAX V1.B16, V2.B16, V0.B16, V0.B16 // chi for lane 0; the round constant is XORed in below
	VBCAX V2.B16, V27.B16, V1.B16, V1.B16
	VBCAX V27.B16, V28.B16, V2.B16, V2.B16

	VEOR V26.B16, V0.B16, V0.B16 // iota

	SUBS $1, R2, R2
	BNE loop

	// Write the 25 lanes back in the same layout they were loaded in.
	VST1.P [V0.D1, V1.D1], 16(R0)
	VST1.P [V2.D1, V3.D1], 16(R0)
	VST1.P [V4.D1, V5.D1], 16(R0)
	VST1.P [V6.D1, V7.D1], 16(R0)
	VST1.P [V8.D1, V9.D1], 16(R0)
	VST1.P [V10.D1, V11.D1], 16(R0)
	VST1.P [V12.D1, V13.D1], 16(R0)
	VST1.P [V14.D1, V15.D1], 16(R0)
	VST1.P [V16.D1, V17.D1], 16(R0)
	VST1.P [V18.D1, V19.D1], 16(R0)
	VST1.P [V20.D1, V21.D1], 16(R0)
	VST1.P [V22.D1, V23.D1], 16(R0)
	VST1 [V24.D1], (R0)

	RET
| 139 | + |
// round_consts holds the 24 64-bit constants that keccakF1600 XORs into
// lane 0 in its iota step, one per round, in round order. The loop reads
// them sequentially, 8 bytes at a time, so entry order must match round
// order. 24 * 8 = 192 bytes in total.
//
// NOTE(review): the symbol is not file-local; if nothing outside this file
// needs it, round_consts<>(SB) would keep it out of the global symbol
// namespace (the reference in keccakF1600 must be renamed together).
DATA round_consts+0x00(SB)/8, $0x0000000000000001
DATA round_consts+0x08(SB)/8, $0x0000000000008082
DATA round_consts+0x10(SB)/8, $0x800000000000808a
DATA round_consts+0x18(SB)/8, $0x8000000080008000
DATA round_consts+0x20(SB)/8, $0x000000000000808b
DATA round_consts+0x28(SB)/8, $0x0000000080000001
DATA round_consts+0x30(SB)/8, $0x8000000080008081
DATA round_consts+0x38(SB)/8, $0x8000000000008009
DATA round_consts+0x40(SB)/8, $0x000000000000008a
DATA round_consts+0x48(SB)/8, $0x0000000000000088
DATA round_consts+0x50(SB)/8, $0x0000000080008009
DATA round_consts+0x58(SB)/8, $0x000000008000000a
DATA round_consts+0x60(SB)/8, $0x000000008000808b
DATA round_consts+0x68(SB)/8, $0x800000000000008b
DATA round_consts+0x70(SB)/8, $0x8000000000008089
DATA round_consts+0x78(SB)/8, $0x8000000000008003
DATA round_consts+0x80(SB)/8, $0x8000000000008002
DATA round_consts+0x88(SB)/8, $0x8000000000000080
DATA round_consts+0x90(SB)/8, $0x000000000000800a
DATA round_consts+0x98(SB)/8, $0x800000008000000a
DATA round_consts+0xA0(SB)/8, $0x8000000080008081
DATA round_consts+0xA8(SB)/8, $0x8000000000008080
DATA round_consts+0xB0(SB)/8, $0x0000000080000001
DATA round_consts+0xB8(SB)/8, $0x8000000080008008
// Read-only data containing no pointers (flag names come from textflag.h).
GLOBL round_consts(SB), NOPTR|RODATA, $192
0 commit comments