|
| 1 | +.intel_syntax noprefix |
| 2 | + |
| 3 | +.section .rodata |
| 4 | + two: .double 2.0 |
| 5 | + one: .double 1.0 |
| 6 | + two_pi: .double -6.28318530718 |
| 7 | + rand_max: .long 4290772992 |
| 8 | + .long 1105199103 |
| 9 | + fmt: .string "%g\n" |
| 10 | + |
| 11 | +.section .text |
| 12 | + .global main |
| 13 | + .extern printf, memset, memcpy, srand, rand, time, cexp, __muldc3, cabs, log2 |
| 14 | + |
| 15 | +# rdi - array ptr |
| 16 | +# rsi - array size |
| 17 | +dft: |
| 18 | + push rbx |
| 19 | + push r12 |
| 20 | + push r13 |
| 21 | + push r14 |
| 22 | + push r15 |
| 23 | + mov r12, rdi # Save parameters |
| 24 | + mov r13, rsi |
| 25 | + sub rsp, r13 # Make a double complex array |
| 26 | + xor r14, r14 # Set index to 0 |
| 27 | +dft_loop_i: |
| 28 | + cmp r14, r13 # Check if index is equal to array size |
| 29 | + je dft_end_i |
| 30 | + lea rax, [rsp + r14] # Set tmp array to zero at r14 |
| 31 | + mov QWORD PTR [rax], 0 |
| 32 | + mov QWORD PTR [rax + 8], 0 |
| 33 | + xor r15, r15 # Set second index to 0 |
| 34 | +dft_loop_j: |
| 35 | + cmp r15, r13 # Check if the index is equal to array size |
| 36 | + je dft_end_j |
| 37 | + movsd xmm1, two_pi # Calculate xmm1 = -2pi * i * j / N |
| 38 | + mov rax, r14 |
| 39 | + imul rax, r15 |
| 40 | + shr rax, 4 |
| 41 | + cvtsi2sdq xmm2, rax |
| 42 | + mulsd xmm1, xmm2 |
| 43 | + cvtsi2sdq xmm2, r13 |
| 44 | + divsd xmm1, xmm2 |
| 45 | + pxor xmm0, xmm0 # Set xmm0 to 0 |
| 46 | + call cexp |
| 47 | + lea rax, [r12 + r15] # Calculate X[i] * cexp(-2pi * i * j / N) |
| 48 | + movsd xmm2, QWORD PTR [rax] |
| 49 | + movsd xmm3, QWORD PTR [rax + 8] |
| 50 | + call __muldc3 |
| 51 | + lea rax, [rsp + r14] |
| 52 | + movsd xmm6, QWORD PTR [rax] # Sum to tmp array |
| 53 | + movsd xmm7, QWORD PTR [rax + 8] |
| 54 | + addsd xmm6, xmm0 |
| 55 | + addsd xmm7, xmm1 |
| 56 | + movsd QWORD PTR [rax], xmm6 # Save to tmp array |
| 57 | + movsd QWORD PTR [rax + 8], xmm7 |
| 58 | + add r15, 16 |
| 59 | + jmp dft_loop_j |
| 60 | +dft_end_j: |
| 61 | + add r14, 16 |
| 62 | + jmp dft_loop_i |
| 63 | +dft_end_i: |
| 64 | + mov rdi, r12 # Move tmp array to array ptr |
| 65 | + mov rsi, rsp |
| 66 | + mov rdx, r13 |
| 67 | + call memcpy |
| 68 | + add rsp, r13 |
| 69 | + pop r15 |
| 70 | + pop r14 |
| 71 | + pop r13 |
| 72 | + pop r12 |
| 73 | + pop rbx |
| 74 | + ret |
| 75 | + |
| 76 | +# rdi - array ptr |
| 77 | +# rsi - array size |
| 78 | +cooley_tukey: |
| 79 | + cmp rsi, 16 # Check if size if greater then 1 |
| 80 | + jle cooley_tukey_return |
| 81 | + push rbx |
| 82 | + push r12 |
| 83 | + push r13 |
| 84 | + push r14 |
| 85 | + push r15 |
| 86 | + mov r12, rdi # Save parameters |
| 87 | + mov r13, rsi |
| 88 | + mov r14, rsi # Save N / 2 |
| 89 | + shr r14, 1 |
| 90 | + sub rsp, r14 # Make a tmp array |
| 91 | + xor r15, r15 |
| 92 | + mov rbx, r12 |
| 93 | +cooley_tukey_spliting: |
| 94 | + cmp r15, r14 |
| 95 | + je cooley_tukey_split |
| 96 | + lea rax, [r12 + 2 * r15] # Moving all odd entries to the front of the array |
| 97 | + movaps xmm0, XMMWORD PTR [rax + 16] |
| 98 | + movaps xmm1, XMMWORD PTR [rax] |
| 99 | + movaps XMMWORD PTR [rsp + r15], xmm0 |
| 100 | + movaps XMMWORD PTR [rbx], xmm1 |
| 101 | + add rbx, 16 |
| 102 | + add r15, 16 |
| 103 | + jmp cooley_tukey_spliting |
| 104 | +cooley_tukey_split: |
| 105 | + mov rax, rsp |
| 106 | + lea rdi, [r12 + r13] |
| 107 | +cooley_tukey_mov_data: |
| 108 | + cmp rbx, rdi |
| 109 | + je cooley_tukey_moved |
| 110 | + movaps xmm0, XMMWORD PTR [rax] |
| 111 | + movaps XMMWORD PTR [rbx], xmm0 |
| 112 | + add rbx, 16 |
| 113 | + add rax, 16 |
| 114 | + jmp cooley_tukey_mov_data |
| 115 | +cooley_tukey_moved: |
| 116 | + add rsp, r14 |
| 117 | + mov rdi, r12 # Makking a recursive call |
| 118 | + mov rsi, r14 |
| 119 | + call cooley_tukey |
| 120 | + lea rdi, [r12 + r14] # Makking a recursive call |
| 121 | + mov rsi, r14 |
| 122 | + call cooley_tukey |
| 123 | + lea rbx, [r12 + r14] |
| 124 | + mov r14, rbx |
| 125 | + mov r15, r12 |
| 126 | +cooley_tukey_loop: |
| 127 | + cmp r15, rbx |
| 128 | + je cooley_tukey_end |
| 129 | + pxor xmm0, xmm0 # Calculate cexp(-2.0 * I * M_PI * i / N) |
| 130 | + movsd xmm1, two_pi |
| 131 | + mov rax, r14 |
| 132 | + sub rax, rbx |
| 133 | + cvtsi2sdq xmm2, rax |
| 134 | + cvtsi2sdq xmm3, r13 |
| 135 | + divsd xmm2, xmm3 |
| 136 | + mulsd xmm1, xmm2 |
| 137 | + call cexp |
| 138 | + movq xmm2, QWORD PTR [r14] # Calculating X[i] - cexp() * X[i + N / 2] |
| 139 | + movq xmm3, QWORD PTR [r14 + 8] |
| 140 | + call __muldc3 |
| 141 | + movq xmm2, QWORD PTR [r15] |
| 142 | + movq xmm3, QWORD PTR [r15 + 8] |
| 143 | + subsd xmm2, xmm0 |
| 144 | + subsd xmm3, xmm1 |
| 145 | + movq QWORD PTR [r14], xmm2 # Save value in X[i + N / 2] |
| 146 | + movq QWORD PTR [r14 + 8], xmm3 |
| 147 | + movq xmm0, QWORD PTR [r15] # Calculating X[i] -= X[i + N / 2] - X[i] |
| 148 | + movq xmm1, QWORD PTR [r15 + 8] |
| 149 | + subsd xmm2, xmm0 |
| 150 | + subsd xmm3, xmm1 |
| 151 | + subsd xmm0, xmm2 |
| 152 | + subsd xmm1, xmm3 |
| 153 | + movq QWORD PTR [r15], xmm0 |
| 154 | + movq QWORD PTR [r15 + 8], xmm1 |
| 155 | + add r14, 16 |
| 156 | + add r15, 16 |
| 157 | + jmp cooley_tukey_loop |
| 158 | +cooley_tukey_end: |
| 159 | + pop r15 |
| 160 | + pop r14 |
| 161 | + pop r13 |
| 162 | + pop r12 |
| 163 | + pop rbx |
| 164 | +cooley_tukey_return: |
| 165 | + ret |
| 166 | + |
| 167 | +# rdi - array ptr |
| 168 | +# rsi - array size |
| 169 | +bit_reverse: |
| 170 | + push rbx |
| 171 | + push r12 |
| 172 | + push r13 |
| 173 | + push r14 |
| 174 | + push r15 |
| 175 | + mov r12, rdi # Save parameters |
| 176 | + mov r13, rsi |
| 177 | + shr r13, 4 |
| 178 | + xor r14, r14 # Loop through all entries |
| 179 | +bit_reverse_entries: |
| 180 | + cmp r14, r13 |
| 181 | + je bit_reverse_return |
| 182 | + cvtsi2sdq xmm0, r13 # Calculating the number of bit in N |
| 183 | + call log2 |
| 184 | + cvttsd2si rcx, xmm0 |
| 185 | + mov rdi, 1 # Calculating (1 << log2(N)) - 1 |
| 186 | + sal edi, cl |
| 187 | + sub edi, 1 |
| 188 | + sub ecx, 1 |
| 189 | + mov rax, r14 |
| 190 | + mov r15, r14 |
| 191 | +bit_reverse_loop: |
| 192 | + sar r15 # Check if r15 is 0 |
| 193 | + je bit_reverse_reversed |
| 194 | + sal rax, 1 # Calculating (rax << 1) | (r15 & 1) |
| 195 | + mov rsi, r15 |
| 196 | + and rsi, 1 |
| 197 | + or rax, rsi |
| 198 | + sub ecx, 1 # Decrement bit count |
| 199 | + jmp bit_reverse_loop |
| 200 | +bit_reverse_reversed: |
| 201 | + sal eax, cl # Calculate (rax << rcx) & (1 << bit count) |
| 202 | + and rax, rdi |
| 203 | + cmp rax, r14 # Check if rax is greater then r14 |
| 204 | + jle bit_reverse_no_swap # If so then swap entries |
| 205 | + shl rax, 4 # Times index by 16 to get bytes to entry |
| 206 | + shl r14, 4 |
| 207 | + movaps xmm0, XMMWORD PTR [r12 + rax] |
| 208 | + movaps xmm1, XMMWORD PTR [r12 + r14] |
| 209 | + movaps XMMWORD PTR [r12 + rax], xmm1 |
| 210 | + movaps XMMWORD PTR [r12 + r14], xmm0 |
| 211 | + shr r14, 4 |
| 212 | +bit_reverse_no_swap: |
| 213 | + add r14, 1 |
| 214 | + jmp bit_reverse_entries |
| 215 | +bit_reverse_return: |
| 216 | + pop r15 |
| 217 | + pop r14 |
| 218 | + pop r13 |
| 219 | + pop r12 |
| 220 | + pop rbx |
| 221 | + ret |
| 222 | + |
| 223 | +# rdi - array ptr |
| 224 | +# rsi - array size |
| 225 | +iterative_cooley_tukey: |
| 226 | + push r12 |
| 227 | + push r13 |
| 228 | + push r14 |
| 229 | + push r15 |
| 230 | + push rbx |
| 231 | + sub rsp, 48 |
| 232 | + mov r12, rdi |
| 233 | + mov r13, rsi |
| 234 | + call bit_reverse # Bit reversing array |
| 235 | + sar r13, 4 # Calculate log2(N) |
| 236 | + cvtsi2sdq xmm0, r13 |
| 237 | + call log2 |
| 238 | + cvttsd2si rax, xmm0 |
| 239 | + mov QWORD PTR [rsp], rax # Save it to the stack |
| 240 | + mov r14, 1 |
| 241 | +iter_ct_loop_i: |
| 242 | + cmp r14, rax # Check if r14 is greater then log2(N) |
| 243 | + jg iter_ct_end_i |
| 244 | + movsd xmm0, two # Calculate stride = 2^(r14) |
| 245 | + cvtsi2sdq xmm1, r14 |
| 246 | + call pow |
| 247 | + cvttsd2si r10, xmm0 |
| 248 | + mov QWORD PTR [rsp + 40], r10# move stride to stack |
| 249 | + movsd xmm1, two_pi # Calculating cexp(-2pi * I / stride) |
| 250 | + divsd xmm1, xmm0 |
| 251 | + pxor xmm0, xmm0 |
| 252 | + call cexp |
| 253 | + movq QWORD PTR [rsp + 8], xmm0 # Save it to stack |
| 254 | + movq QWORD PTR [rsp + 16], xmm1 |
| 255 | + xor r15, r15 |
| 256 | +iter_ct_loop_j: |
| 257 | + cmp r15, r13 # Check if r15 is less then array size |
| 258 | + je iter_ct_end_j |
| 259 | + movsd xmm4, one # Save 1 + 0i to stack |
| 260 | + pxor xmm5, xmm5 |
| 261 | + movsd QWORD PTR [rsp + 24], xmm4 |
| 262 | + movsd QWORD PTR [rsp + 32], xmm5 |
| 263 | + xor rbx, rbx |
| 264 | + mov rax, QWORD PTR [rsp + 40]# Calculate stride / 2 |
| 265 | + sar rax, 1 |
| 266 | +iter_ct_loop_k: |
| 267 | + cmp rbx, rax # Check if rbx is less then stride / 2 |
| 268 | + je iter_ct_end_k |
| 269 | + mov r8, r15 # Saving pointers to X[k + j + stride / 2] and X[k + j] |
| 270 | + add r8, rbx |
| 271 | + sal r8, 4 |
| 272 | + mov r9, QWORD PTR [rsp + 40] |
| 273 | + sal r9, 3 |
| 274 | + add r9, r8 |
| 275 | + lea r9, [r12 + r9] |
| 276 | + lea r8, [r12 + r8] |
| 277 | + movsd xmm0, QWORD PTR [r9] # Calculate X[k + j] - v * X[k + j + stride / 2] |
| 278 | + movsd xmm1, QWORD PTR [r9 + 8] |
| 279 | + movsd xmm2, QWORD PTR [rsp + 24] |
| 280 | + movsd xmm3, QWORD PTR [rsp + 32] |
| 281 | + call __muldc3 |
| 282 | + movsd xmm2, QWORD PTR [r8] |
| 283 | + movsd xmm3, QWORD PTR [r8 + 8] |
| 284 | + subsd xmm2, xmm0 |
| 285 | + subsd xmm3, xmm1 |
| 286 | + movsd QWORD PTR [r9], xmm2 # Saving answer |
| 287 | + movsd QWORD PTR [r9 + 8], xmm3 |
| 288 | + movsd xmm0, QWORD PTR [r8] # Calculating X[k + j] - (X[k + j + stride / 2] - X[k + j]) |
| 289 | + movsd xmm1, QWORD PTR [r8 + 8] |
| 290 | + subsd xmm2, xmm0 |
| 291 | + subsd xmm3, xmm1 |
| 292 | + subsd xmm0, xmm2 |
| 293 | + subsd xmm1, xmm3 |
| 294 | + movsd QWORD PTR [r8], xmm0 # Saving answer |
| 295 | + movsd QWORD PTR [r8 + 8], xmm1 |
| 296 | + movsd xmm0, QWORD PTR [rsp + 24] # Calculating v * w |
| 297 | + movsd xmm1, QWORD PTR [rsp + 32] |
| 298 | + movsd xmm2, QWORD PTR [rsp + 8] |
| 299 | + movsd xmm3, QWORD PTR [rsp + 16] |
| 300 | + call __muldc3 |
| 301 | + movsd QWORD PTR [rsp + 24], xmm0 # Saving answer |
| 302 | + movsd QWORD PTR [rsp + 32], xmm1 |
| 303 | + add rbx, 1 |
| 304 | + mov rax, QWORD PTR [rsp + 40] |
| 305 | + sar rax, 1 |
| 306 | + jmp iter_ct_loop_k |
| 307 | +iter_ct_end_k: |
| 308 | + add r15, QWORD PTR [rsp + 40] |
| 309 | + jmp iter_ct_loop_j |
| 310 | +iter_ct_end_j: |
| 311 | + add r14, 1 |
| 312 | + mov rax, QWORD PTR [rsp] |
| 313 | + jmp iter_ct_loop_i |
| 314 | +iter_ct_end_i: |
| 315 | + add rsp, 48 |
| 316 | + pop rbx |
| 317 | + pop r15 |
| 318 | + pop r14 |
| 319 | + pop r13 |
| 320 | + pop r12 |
| 321 | + ret |
| 322 | + |
| 323 | +# rdi - array a ptr |
| 324 | +# rsi - array b ptr |
| 325 | +# rdx - array size |
| 326 | +approx: |
| 327 | + push r12 |
| 328 | + push r13 |
| 329 | + push r14 |
| 330 | + push r15 |
| 331 | + mov r12, rdi |
| 332 | + mov r13, rsi |
| 333 | + mov r14, rdx |
| 334 | + lea r15, [rdi + rdx] |
| 335 | + sub rsp, 8 |
| 336 | +approx_loop: |
| 337 | + cmp r12, r15 |
| 338 | + je approx_return |
| 339 | + movsd xmm0, QWORD PTR[r13] |
| 340 | + movsd xmm1, QWORD PTR[r13 + 8] |
| 341 | + call cabs |
| 342 | + movsd QWORD PTR [rsp], xmm0 |
| 343 | + movsd xmm0, QWORD PTR[r12] |
| 344 | + movsd xmm1, QWORD PTR[r12 + 8] |
| 345 | + call cabs |
| 346 | + movsd xmm1, QWORD PTR [rsp] |
| 347 | + subsd xmm0, xmm1 |
| 348 | + mov rdi, OFFSET fmt |
| 349 | + mov rax, 1 |
| 350 | + call printf |
| 351 | + add r12, 16 |
| 352 | + add r13, 16 |
| 353 | + jmp approx_loop |
| 354 | +approx_return: |
| 355 | + add rsp, 8 |
| 356 | + pop r15 |
| 357 | + pop r14 |
| 358 | + pop r13 |
| 359 | + pop r12 |
| 360 | + ret |
| 361 | + |
| 362 | +main: |
| 363 | + push r12 |
| 364 | + sub rsp, 2048 |
| 365 | + mov rdi, 0 |
| 366 | + call time |
| 367 | + mov edi, eax |
| 368 | + call srand |
| 369 | + lea r12, [rsp + 1024] |
| 370 | +loop: |
| 371 | + cmp r12, rsp |
| 372 | + je end_loop |
| 373 | + sub r12, 16 |
| 374 | + call rand |
| 375 | + cvtsi2sd xmm0, rax |
| 376 | + divsd xmm0, rand_max |
| 377 | + lea rax, [r12 + 1024] |
| 378 | + movsd QWORD PTR [r12], xmm0 |
| 379 | + movsd QWORD PTR [rax], xmm0 |
| 380 | + mov QWORD PTR [r12 + 8], 0 |
| 381 | + mov QWORD PTR [rax + 8], 0 |
| 382 | + jmp loop |
| 383 | +end_loop: |
| 384 | + mov rdi, rsp |
| 385 | + mov rsi, 1024 |
| 386 | + call iterative_cooley_tukey |
| 387 | + lea rdi, [rsp + 1024] |
| 388 | + mov rsi, 1024 |
| 389 | + call cooley_tukey |
| 390 | + mov rdi, rsp |
| 391 | + lea rsi, [rsp + 1024] |
| 392 | + mov rdx, 1024 |
| 393 | + call approx |
| 394 | + xor rax, rax |
| 395 | + add rsp, 2048 |
| 396 | + pop r12 |
| 397 | + ret |
| 398 | + |
| 399 | + |
0 commit comments