@@ -98,103 +98,59 @@ TEXT ·shrVU(SB), NOSPLIT, $0
98
98
99
99
// func mulAddVWW(z , x [] Word , y , r Word) (c Word)
// Multiply-and-carry over a vector: starting with c = r, for each i in
// [0, z_len) the 128-bit value x[i]*y + c is formed; its low 64 bits are
// stored to z[i] and its high 64 bits become the next c. The final c is
// returned. The trip count is taken from z_len only (presumably the caller
// guarantees len(x) >= len(z) -- confirm).
// In this diff view the "-" lines are the replaced implementation and the
// "+" lines are the new one: the index register now holds a byte offset
// (stepped by 8) and the loop is driven by CTR instead of a compare/branch.
100
100
TEXT ·mulAddVWW(SB) , NOSPLIT , $ 0
101
- MOVD z + 0 (FP) , R10
102
- MOVD x + 24 (FP) , R8
103
- MOVD y + 48 (FP) , R9
104
- MOVD r + 56 (FP) , R4 // c = r
105
- MOVD z_len + 8 (FP) , R11
106
- MOVD $ 0 , R3 // i = 0
107
- MOVD $ 8 , R18
108
- MOVD $ 1 , R19
109
-
110
- JMP e5
111
-
112
- l5:
113
- MULLD R18 , R3 , R5
114
- MOVD ( R8 )(R5) , R20
115
- MULLD R9 , R20 , R6
116
- MULHDU R9 , R20 , R7
117
- ADDC R4 , R6
118
- ADDZE R7
119
- MOVD R6 , ( R10 )(R5)
120
- MOVD R7 , R4
121
- ADD R19 , R3
101
+ MOVD z + 0 (FP) , R10 // R10 = base address of z
102
+ MOVD x + 24 (FP) , R8 // R8 = base address of x
103
+ MOVD y + 48 (FP) , R9 // R9 = y (multiplier)
104
+ MOVD r + 56 (FP) , R4 // R4 = c, the running carry word, seeded with r
105
+ MOVD z_len + 8 (FP) , R11 // R11 = z_len (number of words to process)
122
106
123
- e5:
124
- CMP R3 , R11
125
- BLT l5
107
+ MOVD R0 , R3 // R3 = byte offset into x and z, starts at 0 (relies on R0 holding zero -- confirm)
108
+ CMP R0 , R11 // is z_len == 0 ?
109
+ MOVD R11 , CTR // CTR = z_len, the loop trip count
110
+ BEQ done // empty vector: skip the loop, c = r is returned unchanged
126
111
112
+ loop :
113
+ MOVD ( R8 )(R3) , R20 // R20 = x[i]
114
+ MULLD R9 , R20 , R6 // R6 = z0 = low 64 bits of x[i]*y
115
+ MULHDU R9 , R20 , R7 // R7 = z1 = high 64 bits of x[i]*y (unsigned)
116
+ ADDC R4 , R6 // z0 += c; the carry-out is recorded in CA
117
+ ADDZE R7 // z1 += CA
118
+ MOVD R6 , ( R10 )(R3) // z[i] = z0
119
+ MOVD R7 , R4 // c = z1, carried into the next word
120
+ ADD $ 8 , R3 // step the byte offset to the next 8-byte word
121
+ BC 16 , 0 , loop // bdnz: decrement CTR, loop while CTR != 0
122
+
123
+ done:
127
124
MOVD R4 , c + 64 (FP)
128
125
RET
129
126
130
127
// func addMulVVW(z , x [] Word , y Word) (c Word)
// Multiply-accumulate over a vector: starting with c = 0, for each i in
// [0, z_len) the 128-bit value x[i]*y + z[i] + c is formed; its low 64 bits
// are stored back to z[i] and its high 64 bits become the next c. The final
// c is returned. The trip count is taken from z_len only (presumably the
// caller guarantees len(x) >= len(z) -- confirm).
// In this diff view the "-" lines are the replaced implementation (a 2x
// unrolled loop plus a remainder loop) and the "+" lines are the new one:
// a single CTR-driven loop indexed by a byte offset.
131
128
TEXT ·addMulVVW(SB) , NOSPLIT , $ 0
132
- MOVD z + 0 (FP) , R10
133
- MOVD x + 24 (FP) , R8
134
- MOVD y + 48 (FP) , R9
135
- MOVD z_len + 8 (FP) , R22
136
-
137
- MOVD $ 0 , R5 // i = 0
138
- MOVD $ 0 , R4 // c = 0
139
- MOVD $ 8 , R28
140
- MOVD $ - 2 , R23
141
- AND R22 , R23 // mask the last bit of z.len
142
- MOVD $ 2 , R24
143
- CMP R23 , R24
144
- BGE unrolled
145
- JMP end
146
-
147
- unrolled:
148
- MOVD $ 8 , R19 // no (RA)(RB * 8 ) on power
149
- MULLD R5 , R19
150
- MOVD ( R10 )(R19) , R11 // R11 = z [ i ]
151
- MOVD ( R8 )(R19) , R16 // R16 = x [ i ]
152
- ADD R28 , R19 , R25
153
- MOVD ( R10 )(R25) , R17
154
- MOVD ( R8 )(R25) , R18
155
-
156
- MULLD R9 , R16 , R12
157
- MULHDU R9 , R16 , R14
158
- MULLD R9 , R18 , R6
159
- MULHDU R9 , R18 , R7
160
- ADDC R4 , R12
161
- ADDZE R14
162
- ADDC R11 , R12 // z [ i ] = (x [ i ]* y) + z [ i ] + carry
163
- ADDZE R14 // carry = high order bits + add carry
164
- MOVD R12 , ( R10 )(R19)
165
- ADDC R14 , R6
166
- ADDZE R7
167
- ADDC R17 , R6
168
- ADDZE R7
169
- MOVD R6 , ( R10 )(R25)
170
- MOVD R7 , R4
129
+ MOVD z + 0 (FP) , R10 // R10 = base address of z
130
+ MOVD x + 24 (FP) , R8 // R8 = base address of x
131
+ MOVD y + 48 (FP) , R9 // R9 = y (multiplier)
132
+ MOVD z_len + 8 (FP) , R22 // R22 = z_len (number of words to process)
171
133
172
- ADD R24 , R5
173
- CMP R5 , R23
174
- BLT unrolled
175
- JMP end
134
+ MOVD R0 , R3 // R3 = byte offset into x and z, starts at 0 (relies on R0 holding zero -- confirm)
135
+ CMP R0 , R22 // is z_len == 0 ?
136
+ MOVD R0 , R4 // R4 = c = 0, the running carry word
137
+ MOVD R22 , CTR // CTR = z_len, the loop trip count
138
+ BEQ done // empty vector: skip the loop and return c = 0
176
139
177
140
loop :
178
- MOVD $ 8 , R19
179
- MULLD R5 , R19
180
- MOVD ( R10 )(R19) , R11
181
- MOVD ( R8 )(R19) , R16
182
- MULLD R9 , R16 , R12
183
- MULHDU R9 , R16 , R14
184
- ADDC R4 , R12
185
- ADDZE R14
186
- ADDC R11 , R12
187
- ADDZE R14
188
- MOVD R12 , ( R10 )(R19)
189
- MOVD R14 , R4
190
-
191
- MOVD $ 1 , R15
192
- ADD R15 , R5
193
-
194
- end:
195
- CMP R5 , R22
196
- BLT loop
141
+ MOVD ( R8 )(R3) , R20 // R20 = x[i]
142
+ MOVD ( R10 )(R3) , R21 // R21 = z[i]
143
+ MULLD R9 , R20 , R6 // R6 = low 64 bits of x[i]*y
144
+ MULHDU R9 , R20 , R7 // R7 = high 64 bits of x[i]*y (unsigned)
145
+ ADDC R21 , R6 // R6 += z[i]; the carry-out is recorded in CA
146
+ ADDZE R7 // R7 += CA
147
+ ADDC R4 , R6 // R6 += c (carry word from the previous iteration); carry-out in CA
148
+ ADDZE R7 , R4 // c = R7 + CA (R4 is overwritten, not accumulated)
149
+ MOVD R6 , ( R10 )(R3) // z[i] = R6
150
+ ADD $ 8 , R3 // step the byte offset to the next 8-byte word
151
+ BC 16 , 0 , loop // bdnz: decrement CTR, loop while CTR != 0
197
152
153
+ done:
198
154
MOVD R4 , c + 56 (FP)
199
155
RET
200
156
0 commit comments