From 60816d43004a0b3683fb31360562f9ec595f002d Mon Sep 17 00:00:00 2001 From: Gathros <6323830+Gathros@users.noreply.github.com> Date: Mon, 5 Nov 2018 10:40:16 +0000 Subject: [PATCH 1/4] Adding fft.s --- contents/cooley_tukey/code/asm-x64/fft.s | 398 +++++++++++++++++++++++ 1 file changed, 398 insertions(+) create mode 100644 contents/cooley_tukey/code/asm-x64/fft.s diff --git a/contents/cooley_tukey/code/asm-x64/fft.s b/contents/cooley_tukey/code/asm-x64/fft.s new file mode 100644 index 000000000..1470e87ce --- /dev/null +++ b/contents/cooley_tukey/code/asm-x64/fft.s @@ -0,0 +1,398 @@ +.intel_syntax noprefix + +.section .rodata + two: .double 2.0 + one: .double 1.0 + two_pi: .double -6.28318530718 + rand_max: .long 4290772992 + .long 1105199103 + fmt: .string "%g\n" + +.section .text + .global main + .extern printf, memset, memcpy, srand, rand, time, cexp, __muldc3, cabs, log2 + +# rdi - array ptr +# rsi - array size +dft: + push rbx + push r12 + push r13 + push r14 + push r15 + mov r12, rdi # Save parameters + mov r13, rsi + sub rsp, r13 # Make a double complex array + xor r14, r14 # Set index to 0 +dft_loop_i: + cmp r14, r13 # Check if index is equal to array size + je dft_end_i + lea rax, [rsp + r14] # Set tmp array to zero at r14 + mov QWORD PTR [rax], 0 + mov QWORD PTR [rax + 8], 0 + xor r15, r15 # Set second index to 0 +dft_loop_j: + cmp r15, r13 # Check if the index is equal to array size + je dft_end_j + movsd xmm1, two_pi # Calculate xmm1 = -2pi * i * j / N + mov rax, r14 + imul rax, r15 + shr rax, 4 + cvtsi2sdq xmm2, rax + mulsd xmm1, xmm2 + cvtsi2sdq xmm2, r13 + divsd xmm1, xmm2 + pxor xmm0, xmm0 # Set xmm0 to 0 + call cexp + lea rax, [r12 + r15] # Calculate X[i] * cexp(-2pi * i * j / N) + movsd xmm2, QWORD PTR [rax] + movsd xmm3, QWORD PTR [rax + 8] + call __muldc3 + lea rax, [rsp + r14] + movsd xmm6, QWORD PTR [rax] # Sum to tmp array + movsd xmm7, QWORD PTR [rax + 8] + addsd xmm6, xmm0 + addsd xmm7, xmm1 + movsd QWORD PTR [rax], xmm6 # Save to tmp array + movsd QWORD PTR [rax + 8], xmm7 + add r15, 16 + jmp dft_loop_j +dft_end_j: + add r14, 16 + jmp dft_loop_i +dft_end_i: + mov rdi, r12 # Move tmp array to array ptr + mov rsi, rsp + mov rdx, r13 + call memcpy + add rsp, r13 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret + +# rdi - array ptr +# rsi - array size +cooley_tukey: + cmp rsi, 16 # Check if size if greater then 1 + jle cooley_tukey_return + push rbx + push r12 + push r13 + push r14 + push r15 + mov r12, rdi # Save parameters + mov r13, rsi + mov r14, rsi # Save N / 2 + shr r14, 1 + sub rsp, r14 # Make a tmp array + xor r15, r15 + mov rbx, r12 +cooley_tukey_spliting: + cmp r15, r14 + je cooley_tukey_split + lea rax, [r12 + 2 * r15] # Moving all odd entries to the front of the array + movaps xmm0, XMMWORD PTR [rax + 16] + movaps xmm1, XMMWORD PTR [rax] + movaps XMMWORD PTR [rsp + r15], xmm0 + movaps XMMWORD PTR [rbx], xmm1 + add rbx, 16 + add r15, 16 + jmp cooley_tukey_spliting +cooley_tukey_split: + mov rax, rsp + lea rdi, [r12 + r13] +cooley_tukey_mov_data: + cmp rbx, rdi + je cooley_tukey_moved + movaps xmm0, XMMWORD PTR [rax] + movaps XMMWORD PTR [rbx], xmm0 + add rbx, 16 + add rax, 16 + jmp cooley_tukey_mov_data +cooley_tukey_moved: + add rsp, r14 + mov rdi, r12 # Makking a recursive call + mov rsi, r14 + call cooley_tukey + lea rdi, [r12 + r14] # Makking a recursive call + mov rsi, r14 + call cooley_tukey + lea rbx, [r12 + r14] + mov r14, rbx + mov r15, r12 +cooley_tukey_loop: + cmp r15, rbx + je cooley_tukey_end + pxor xmm0, xmm0 # Calculate cexp(-2.0 * I * M_PI * i / N) + movsd xmm1, two_pi + mov rax, r14 + sub rax, rbx + cvtsi2sdq xmm2, rax + cvtsi2sdq xmm3, r13 + divsd xmm2, xmm3 + mulsd xmm1, xmm2 + call cexp + movq xmm2, QWORD PTR [r14] # Calculating X[i] - cexp() * X[i + N / 2] + movq xmm3, QWORD PTR [r14 + 8] + call __muldc3 + movq xmm2, QWORD PTR [r15] + movq xmm3, QWORD PTR [r15 + 8] + subsd xmm2, xmm0 + subsd xmm3, xmm1 + movq QWORD PTR [r14], xmm2 # Save value in X[i + N / 2] + movq QWORD PTR [r14 + 8], xmm3 + movq xmm0, QWORD PTR [r15] # Calculating X[i] -= X[i + N / 2] - X[i] + movq xmm1, QWORD PTR [r15 + 8] + subsd xmm2, xmm0 + subsd xmm3, xmm1 + subsd xmm0, xmm2 + subsd xmm1, xmm3 + movq QWORD PTR [r15], xmm0 + movq QWORD PTR [r15 + 8], xmm1 + add r14, 16 + add r15, 16 + jmp cooley_tukey_loop +cooley_tukey_end: + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx +cooley_tukey_return: + ret + +# rdi - array ptr +# rsi - array size +bit_reverse: + push rbx + push r12 + push r13 + push r14 + push r15 + mov r12, rdi # Save parameters + mov r13, rsi + shr r13, 4 + xor r14, r14 # Loop through all entries +bit_reverse_entries: + cmp r14, r13 + je bit_reverse_return + cvtsi2sdq xmm0, r13 # Calculating the number of bit in N + call log2 + cvttsd2si rcx, xmm0 + mov rdi, 1 # Calculating (1 << log2(N)) - 1 + sal edi, cl + sub edi, 1 + sub ecx, 1 + mov rax, r14 + mov r15, r14 +bit_reverse_loop: + sar r15 # Check if r15 is 0 + je bit_reverse_reversed + sal rax, 1 # Calculating (rax << 1) | (r15 & 1) + mov rsi, r15 + and rsi, 1 + or rax, rsi + sub ecx, 1 # Decrement bit count + jmp bit_reverse_loop +bit_reverse_reversed: + sal eax, cl # Calculate (rax << rcx) & (1 << bit count) + and rax, rdi + cmp rax, r14 # Check if rax is greater then r14 + jle bit_reverse_no_swap # If so then swap entries + shl rax, 4 # Times index by 16 to get bytes to entry + shl r14, 4 + movaps xmm0, XMMWORD PTR [r12 + rax] + movaps xmm1, XMMWORD PTR [r12 + r14] + movaps XMMWORD PTR [r12 + rax], xmm1 + movaps XMMWORD PTR [r12 + r14], xmm0 + shr r14, 4 +bit_reverse_no_swap: + add r14, 1 + jmp bit_reverse_entries +bit_reverse_return: + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret + +# rdi - array ptr +# rsi - array size +iterative_cooley_tukey: + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + sub rsp, 40 + mov r12, rdi + mov r13, rsi + call bit_reverse # Bit reversing array + sar r13, 4 # Calculate log2(N) + cvtsi2sdq xmm0, r13 + call log2 + cvttsd2si rax, xmm0 + mov QWORD PTR [rsp], rax # Save it to the stack + mov r14, 1 +iter_ct_loop_i: + cmp r14, rax # Check if r14 is greater then log2(N) + jg iter_ct_end_i + movsd xmm0, two # Calculate stride = 2^(r14) + cvtsi2sdq xmm1, r14 + call pow + cvttsd2si rbp, xmm0 + movsd xmm1, two_pi # Calculating cexp(-2pi * I / stride) + divsd xmm1, xmm0 + pxor xmm0, xmm0 + call cexp + movq QWORD PTR [rsp + 8], xmm0 # Save it to stack + movq QWORD PTR [rsp + 16], xmm1 + xor r15, r15 +iter_ct_loop_j: + cmp r15, r13 # Check if r15 is less then array size + je iter_ct_end_j + movsd xmm4, one # Save 1 + 0i to stack + pxor xmm5, xmm5 + movsd QWORD PTR [rsp + 24], xmm4 + movsd QWORD PTR [rsp + 32], xmm5 + xor rbx, rbx + mov rax, rbp # Calculate stride / 2 + sar rax, 1 +iter_ct_loop_k: + cmp rbx, rax # Check if rbx is less then stride / 2 + je iter_ct_end_k + mov r8, r15 # Saving pointers to X[k + j + stride / 2] and X[k + j] + add r8, rbx + sal r8, 4 + mov r9, rbp + sal r9, 3 + add r9, r8 + lea r9, [r12 + r9] + lea r8, [r12 + r8] + movsd xmm0, QWORD PTR [r9] # Calculate X[k + j] - v * X[k + j + stride / 2] + movsd xmm1, QWORD PTR [r9 + 8] + movsd xmm2, QWORD PTR [rsp + 24] + movsd xmm3, QWORD PTR [rsp + 32] + call __muldc3 + movsd xmm2, QWORD PTR [r8] + movsd xmm3, QWORD PTR [r8 + 8] + subsd xmm2, xmm0 + subsd xmm3, xmm1 + movsd QWORD PTR [r9], xmm2 # Saving answer + movsd QWORD PTR [r9 + 8], xmm3 + movsd xmm0, QWORD PTR [r8] # Calculating X[k + j] - (X[k + j + stride / 2] - X[k + j]) + movsd xmm1, QWORD PTR [r8 + 8] + subsd xmm2, xmm0 + subsd xmm3, xmm1 + subsd xmm0, xmm2 + subsd xmm1, xmm3 + movsd QWORD PTR [r8], xmm0 # Saving answer + movsd QWORD PTR [r8 + 8], xmm1 + movsd xmm0, QWORD PTR [rsp + 24] # Calculating v * w + movsd xmm1, QWORD PTR [rsp + 32] + movsd xmm2, QWORD PTR [rsp + 8] + movsd xmm3, QWORD PTR [rsp + 16] + call __muldc3 + movsd QWORD PTR [rsp + 24], xmm0 # Saving answer + movsd QWORD PTR [rsp + 32], xmm1 + add rbx, 1 + mov rax, rbp + sar rax, 1 + jmp iter_ct_loop_k +iter_ct_end_k: + add r15, rbp + jmp iter_ct_loop_j +iter_ct_end_j: + add r14, 1 + mov rax, QWORD PTR [rsp] + jmp iter_ct_loop_i +iter_ct_end_i: + add rsp, 40 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +# rdi - array a ptr +# rsi - array b ptr +# rdx - array size +approx: + push r12 + push r13 + push r14 + push r15 + mov r12, rdi + mov r13, rsi + mov r14, rdx + lea r15, [rdi + rdx] + sub rsp, 8 +approx_loop: + cmp r12, r15 + je approx_return + movsd xmm0, QWORD PTR[r13] + movsd xmm1, QWORD PTR[r13 + 8] + call cabs + movsd QWORD PTR [rsp], xmm0 + movsd xmm0, QWORD PTR[r12] + movsd xmm1, QWORD PTR[r12 + 8] + call cabs + movsd xmm1, QWORD PTR [rsp] + subsd xmm0, xmm1 + mov rdi, OFFSET fmt + mov rax, 1 + call printf + add r12, 16 + add r13, 16 + jmp approx_loop +approx_return: + add rsp, 8 + pop r15 + pop r14 + pop r13 + pop r12 + ret + +main: + push r12 + sub rsp, 2048 + mov rdi, 0 + call time + mov edi, eax + call srand + lea r12, [rsp + 1024] +loop: + cmp r12, rsp + je end_loop + sub r12, 16 + call rand + cvtsi2sd xmm0, rax + divsd xmm0, rand_max + lea rax, [r12 + 1024] + movsd QWORD PTR [r12], xmm0 + movsd QWORD PTR [rax], xmm0 + mov QWORD PTR [r12 + 8], 0 + mov QWORD PTR [rax + 8], 0 + jmp loop +end_loop: + mov rdi, rsp + mov rsi, 1024 + call iterative_cooley_tukey + lea rdi, [rsp + 1024] + mov rsi, 1024 + call cooley_tukey + mov rdi, rsp + lea rsi, [rsp + 1024] + mov rdx, 1024 + call approx + add rsp, 2048 + pop r12 + ret + From 93c76a5b884ab02efad92ec2a577352c21d2b964 Mon Sep 17 00:00:00 2001 From: Gathros <6323830+Gathros@users.noreply.github.com> Date: Mon, 5 Nov 2018 10:43:59 +0000 Subject: [PATCH 2/4] Updating cooley_tukey.md --- contents/cooley_tukey/cooley_tukey.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/contents/cooley_tukey/cooley_tukey.md b/contents/cooley_tukey/cooley_tukey.md index c2a7434ab..addbb7342 100644 --- a/contents/cooley_tukey/cooley_tukey.md +++ b/contents/cooley_tukey/cooley_tukey.md @@ -83,6 +83,8 @@ For some reason, though, putting code to this transformation really helped me fi [import:5-11, lang:"python"](code/python/fft.py) {% sample lang="scratch" %} [import:4-13, lang:"julia"](code/julia/fft.jl) +{% sample lang="asm-x64" %} +[import:15-74, lang:"asm-x64"](code/asm-x64/fft.s) {% endmethod %} In this function, we define `n` to be a set of integers from $$0 \rightarrow N-1$$ and arrange them to be a column. @@ -130,6 +132,8 @@ In the end, the code looks like: [import:13-24, lang:"python"](code/python/fft.py) {% sample lang="scratch" %} [import:16-32, lang:"julia"](code/julia/fft.jl) +{% sample lang="asm-x64" %} +[import:76-165, lang:"asm-x64"](code/asm-x64/fft.s) {% endmethod %} As a side note, we are enforcing that the array must be a power of 2 for the operation to work. @@ -236,6 +240,8 @@ Note: I implemented this in Julia because the code seems more straightforward in [import, lang:"python"](code/python/fft.py) {% sample lang="scratch" %} Some rather impressive scratch code was submitted by Jie and can be found here: https://scratch.mit.edu/projects/37759604/#editor +{% sample lang="asm-x64" %} +[import, lang:"asm-x64"](code/asm-x64/fft.s) {% endmethod %} From a747b4d52a35a9d08ca1e3be0f1c0385c3961ca1 Mon Sep 17 00:00:00 2001 From: Gathros <6323830+Gathros@users.noreply.github.com> Date: Sat, 17 Nov 2018 23:21:11 +0000 Subject: [PATCH 3/4] return 0 on fft.s --- contents/cooley_tukey/code/asm-x64/fft.s | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contents/cooley_tukey/code/asm-x64/fft.s b/contents/cooley_tukey/code/asm-x64/fft.s index 1470e87ce..b8fdbe829 100644 --- a/contents/cooley_tukey/code/asm-x64/fft.s +++ b/contents/cooley_tukey/code/asm-x64/fft.s @@ -392,7 +392,9 @@ end_loop: lea rsi, [rsp + 1024] mov rdx, 1024 call approx + xor rax, rax add rsp, 2048 pop r12 ret + From 44de64fb7351aa10b7d3d13543f46cfe11f8470a Mon Sep 17 00:00:00 2001 From: Gathros <6323830+Gathros@users.noreply.github.com> Date: Mon, 24 Dec 2018 11:00:35 +0000 Subject: [PATCH 4/4] small fixes to fft.s --- contents/cooley_tukey/code/asm-x64/fft.s | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/contents/cooley_tukey/code/asm-x64/fft.s b/contents/cooley_tukey/code/asm-x64/fft.s index b8fdbe829..566ec3ae7 100644 --- a/contents/cooley_tukey/code/asm-x64/fft.s +++ b/contents/cooley_tukey/code/asm-x64/fft.s @@ -228,8 +228,7 @@ iterative_cooley_tukey: push r14 push r15 push rbx - push rbp - sub rsp, 40 + sub rsp, 48 mov r12, rdi mov r13, rsi call bit_reverse # Bit reversing array @@ -245,7 +244,8 @@ iter_ct_loop_i: movsd xmm0, two # Calculate stride = 2^(r14) cvtsi2sdq xmm1, r14 call pow - cvttsd2si rbp, xmm0 + cvttsd2si r10, xmm0 + mov QWORD PTR [rsp + 40], r10# move stride to stack movsd xmm1, two_pi # Calculating cexp(-2pi * I / stride) divsd xmm1, xmm0 pxor xmm0, xmm0 @@ -261,7 +261,7 @@ iter_ct_loop_j: movsd QWORD PTR [rsp + 24], xmm4 movsd QWORD PTR [rsp + 32], xmm5 xor rbx, rbx - mov rax, rbp # Calculate stride / 2 + mov rax, QWORD PTR [rsp + 40]# Calculate stride / 2 sar rax, 1 iter_ct_loop_k: cmp rbx, rax # Check if rbx is less then stride / 2 @@ -269,7 +269,7 @@ iter_ct_loop_k: mov r8, r15 # Saving pointers to X[k + j + stride / 2] and X[k + j] add r8, rbx sal r8, 4 - mov r9, rbp + mov r9, QWORD PTR [rsp + 40] sal r9, 3 add r9, r8 lea r9, [r12 + r9] @@ -301,19 +301,18 @@ iter_ct_loop_k: movsd QWORD PTR [rsp + 24], xmm0 # Saving answer movsd QWORD PTR [rsp + 32], xmm1 add rbx, 1 - mov rax, rbp + mov rax, QWORD PTR [rsp + 40] sar rax, 1 jmp iter_ct_loop_k iter_ct_end_k: - add r15, rbp + add r15, QWORD PTR [rsp + 40] jmp iter_ct_loop_j iter_ct_end_j: add r14, 1 mov rax, QWORD PTR [rsp] jmp iter_ct_loop_i iter_ct_end_i: - add rsp, 40 - pop rbp + add rsp, 48 pop rbx pop r15 pop r14