From 60816d43004a0b3683fb31360562f9ec595f002d Mon Sep 17 00:00:00 2001
From: Gathros <6323830+Gathros@users.noreply.github.com>
Date: Mon, 5 Nov 2018 10:40:16 +0000
Subject: [PATCH 1/4] Adding fft.s

---
 contents/cooley_tukey/code/asm-x64/fft.s | 398 +++++++++++++++++++++++
 1 file changed, 398 insertions(+)
 create mode 100644 contents/cooley_tukey/code/asm-x64/fft.s

diff --git a/contents/cooley_tukey/code/asm-x64/fft.s b/contents/cooley_tukey/code/asm-x64/fft.s
new file mode 100644
index 000000000..1470e87ce
--- /dev/null
+++ b/contents/cooley_tukey/code/asm-x64/fft.s
@@ -0,0 +1,398 @@
+.intel_syntax noprefix
+
+.section .rodata
+  two:           .double 2.0
+  one:           .double 1.0
+  two_pi:        .double -6.28318530718
+  rand_max:      .long 4290772992
+                 .long 1105199103
+  fmt:           .string "%g\n"
+
+.section .text
+  .global main
+  .extern printf, memset, memcpy, srand, rand, time, cexp, __muldc3, cabs, log2
+
+# rdi - array ptr
+# rsi - array size
+dft:
+  push   rbx
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  mov    r12, rdi                    # Save parameters
+  mov    r13, rsi
+  sub    rsp, r13                    # Make a double complex array
+  xor    r14, r14                    # Set index to 0
+dft_loop_i:
+  cmp    r14, r13                    # Check if index is equal to array size
+  je     dft_end_i
+  lea    rax, [rsp + r14]            # Set tmp array to zero at r14
+  mov    QWORD PTR [rax], 0
+  mov    QWORD PTR [rax + 8], 0
+  xor    r15, r15                    # Set second index to 0
+dft_loop_j:
+  cmp    r15, r13                    # Check if the index is equal to array size
+  je     dft_end_j
+  movsd  xmm1, two_pi                # Calculate xmm1 = -2pi * i * j / N
+  mov    rax, r14
+  imul   rax, r15
+  shr    rax, 4
+  cvtsi2sdq xmm2, rax
+  mulsd  xmm1, xmm2
+  cvtsi2sdq xmm2, r13
+  divsd  xmm1, xmm2
+  pxor   xmm0, xmm0                  # Set xmm0 to 0
+  call   cexp
+  lea    rax, [r12 + r15]            # Calculate X[i] * cexp(-2pi * i * j / N)
+  movsd  xmm2, QWORD PTR [rax]
+  movsd  xmm3, QWORD PTR [rax + 8]
+  call   __muldc3
+  lea    rax, [rsp + r14]
+  movsd  xmm6, QWORD PTR [rax]       # Sum to tmp array
+  movsd  xmm7, QWORD PTR [rax + 8]
+  addsd  xmm6, xmm0
+  addsd  xmm7, xmm1
+  movsd  QWORD PTR [rax], xmm6       # Save to tmp array
+  movsd  QWORD PTR [rax + 8], xmm7
+  add    r15, 16
+  jmp    dft_loop_j
+dft_end_j:
+  add    r14, 16
+  jmp    dft_loop_i
+dft_end_i:
+  mov    rdi, r12                    # Move tmp array to array ptr
+  mov    rsi, rsp
+  mov    rdx, r13
+  call   memcpy
+  add    rsp, r13
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  pop    rbx
+  ret
+
+# rdi - array ptr
+# rsi - array size
+cooley_tukey:
+  cmp    rsi, 16                     # Check if size if greater then 1
+  jle    cooley_tukey_return
+  push   rbx
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  mov    r12, rdi                    # Save parameters
+  mov    r13, rsi
+  mov    r14, rsi                    # Save N / 2
+  shr    r14, 1
+  sub    rsp, r14                    # Make a tmp array
+  xor    r15, r15
+  mov    rbx, r12
+cooley_tukey_spliting:
+  cmp    r15, r14
+  je     cooley_tukey_split
+  lea    rax, [r12 + 2 * r15]        # Moving all odd entries to the front of the array
+  movaps xmm0, XMMWORD PTR [rax + 16]
+  movaps xmm1, XMMWORD PTR [rax]
+  movaps XMMWORD PTR [rsp + r15], xmm0
+  movaps XMMWORD PTR [rbx], xmm1
+  add    rbx, 16
+  add    r15, 16
+  jmp    cooley_tukey_spliting
+cooley_tukey_split:
+  mov    rax, rsp
+  lea    rdi, [r12 + r13]
+cooley_tukey_mov_data:
+  cmp    rbx, rdi
+  je     cooley_tukey_moved
+  movaps xmm0, XMMWORD PTR [rax]
+  movaps XMMWORD PTR [rbx], xmm0
+  add    rbx, 16
+  add    rax, 16
+  jmp    cooley_tukey_mov_data
+cooley_tukey_moved:
+  add    rsp, r14
+  mov    rdi, r12                   # Makking a recursive call
+  mov    rsi, r14
+  call   cooley_tukey
+  lea    rdi, [r12 + r14]           # Makking a recursive call
+  mov    rsi, r14
+  call   cooley_tukey
+  lea    rbx, [r12 + r14]
+  mov    r14, rbx
+  mov    r15, r12
+cooley_tukey_loop:
+  cmp    r15, rbx
+  je     cooley_tukey_end
+  pxor   xmm0, xmm0                 # Calculate cexp(-2.0 * I * M_PI * i / N)
+  movsd  xmm1, two_pi
+  mov    rax, r14
+  sub    rax, rbx
+  cvtsi2sdq xmm2, rax
+  cvtsi2sdq xmm3, r13
+  divsd  xmm2, xmm3
+  mulsd  xmm1, xmm2
+  call   cexp
+  movq   xmm2, QWORD PTR [r14]      # Calculating X[i] - cexp() * X[i + N / 2]
+  movq   xmm3, QWORD PTR [r14 + 8]
+  call   __muldc3
+  movq   xmm2, QWORD PTR [r15]
+  movq   xmm3, QWORD PTR [r15 + 8]
+  subsd  xmm2, xmm0
+  subsd  xmm3, xmm1
+  movq   QWORD PTR [r14], xmm2      # Save value in X[i + N / 2]
+  movq   QWORD PTR [r14 + 8], xmm3
+  movq   xmm0, QWORD PTR [r15]      # Calculating X[i] -= X[i + N / 2] - X[i]
+  movq   xmm1, QWORD PTR [r15 + 8]
+  subsd  xmm2, xmm0
+  subsd  xmm3, xmm1
+  subsd  xmm0, xmm2
+  subsd  xmm1, xmm3
+  movq   QWORD PTR [r15], xmm0
+  movq   QWORD PTR [r15 + 8], xmm1
+  add    r14, 16
+  add    r15, 16
+  jmp    cooley_tukey_loop
+cooley_tukey_end:
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  pop    rbx
+cooley_tukey_return:
+  ret
+
+# rdi - array ptr
+# rsi - array size
+bit_reverse:
+  push   rbx
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  mov    r12, rdi                  # Save parameters
+  mov    r13, rsi
+  shr    r13, 4
+  xor    r14, r14                  # Loop through all entries
+bit_reverse_entries:
+  cmp    r14, r13
+  je     bit_reverse_return
+  cvtsi2sdq xmm0, r13              # Calculating the number of bit in N
+  call   log2
+  cvttsd2si rcx, xmm0
+  mov    rdi, 1                    # Calculating (1 << log2(N)) - 1
+  sal    edi, cl
+  sub    edi, 1
+  sub    ecx, 1
+  mov    rax, r14
+  mov    r15, r14
+bit_reverse_loop:
+  sar    r15                       # Check if r15 is 0
+  je     bit_reverse_reversed
+  sal    rax, 1                    # Calculating (rax << 1) | (r15 & 1)
+  mov    rsi, r15
+  and    rsi, 1
+  or     rax, rsi
+  sub    ecx, 1                    # Decrement bit count
+  jmp    bit_reverse_loop
+bit_reverse_reversed:
+  sal    eax, cl                   # Calculate (rax << rcx) & (1 << bit count)
+  and    rax, rdi
+  cmp    rax, r14                  # Check if rax is greater then r14
+  jle    bit_reverse_no_swap       # If so then swap entries
+  shl    rax, 4                    # Times index by 16 to get bytes to entry
+  shl    r14, 4
+  movaps xmm0, XMMWORD PTR [r12 + rax]
+  movaps xmm1, XMMWORD PTR [r12 + r14]
+  movaps XMMWORD PTR [r12 + rax], xmm1
+  movaps XMMWORD PTR [r12 + r14], xmm0
+  shr    r14, 4
+bit_reverse_no_swap:
+  add    r14, 1
+  jmp    bit_reverse_entries
+bit_reverse_return:
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  pop    rbx
+  ret
+
+# rdi - array ptr
+# rsi - array size
+iterative_cooley_tukey:
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  push   rbx
+  push   rbp
+  sub    rsp, 40
+  mov    r12, rdi
+  mov    r13, rsi
+  call   bit_reverse              # Bit reversing array
+  sar    r13, 4                   # Calculate log2(N)
+  cvtsi2sdq xmm0, r13
+  call   log2
+  cvttsd2si rax, xmm0
+  mov    QWORD PTR [rsp], rax     # Save it to the stack
+  mov    r14, 1
+iter_ct_loop_i:
+  cmp    r14, rax                 # Check if r14 is greater then log2(N)
+  jg     iter_ct_end_i
+  movsd  xmm0, two                # Calculate stride = 2^(r14)
+  cvtsi2sdq xmm1, r14
+  call   pow
+  cvttsd2si rbp, xmm0
+  movsd  xmm1, two_pi             # Calculating cexp(-2pi * I / stride)
+  divsd  xmm1, xmm0
+  pxor   xmm0, xmm0
+  call   cexp
+  movq   QWORD PTR [rsp + 8], xmm0  # Save it to stack
+  movq   QWORD PTR [rsp + 16], xmm1
+  xor    r15, r15
+iter_ct_loop_j:
+  cmp    r15, r13                 # Check if r15 is less then array size
+  je     iter_ct_end_j
+  movsd  xmm4, one                # Save 1 + 0i to stack
+  pxor   xmm5, xmm5
+  movsd  QWORD PTR [rsp + 24], xmm4
+  movsd  QWORD PTR [rsp + 32], xmm5
+  xor    rbx, rbx
+  mov    rax, rbp                 # Calculate stride / 2
+  sar    rax, 1
+iter_ct_loop_k:
+  cmp    rbx, rax                 # Check if rbx is less then stride / 2
+  je     iter_ct_end_k
+  mov    r8, r15                  # Saving pointers to X[k + j + stride / 2] and X[k + j]
+  add    r8, rbx
+  sal    r8, 4
+  mov    r9, rbp
+  sal    r9, 3
+  add    r9, r8
+  lea    r9, [r12 + r9]
+  lea    r8, [r12 + r8]
+  movsd  xmm0, QWORD PTR [r9]     # Calculate X[k + j] - v * X[k + j + stride / 2]
+  movsd  xmm1, QWORD PTR [r9 + 8]
+  movsd  xmm2, QWORD PTR [rsp + 24]
+  movsd  xmm3, QWORD PTR [rsp + 32]
+  call   __muldc3
+  movsd  xmm2, QWORD PTR [r8]
+  movsd  xmm3, QWORD PTR [r8 + 8]
+  subsd  xmm2, xmm0
+  subsd  xmm3, xmm1
+  movsd  QWORD PTR [r9], xmm2     # Saving answer
+  movsd  QWORD PTR [r9 + 8], xmm3
+  movsd  xmm0, QWORD PTR [r8]     # Calculating X[k + j] - (X[k + j + stride / 2] - X[k + j])
+  movsd  xmm1, QWORD PTR [r8 + 8]
+  subsd  xmm2, xmm0
+  subsd  xmm3, xmm1
+  subsd  xmm0, xmm2
+  subsd  xmm1, xmm3
+  movsd  QWORD PTR [r8], xmm0     # Saving answer
+  movsd  QWORD PTR [r8 + 8], xmm1
+  movsd  xmm0, QWORD PTR [rsp + 24] # Calculating v * w
+  movsd  xmm1, QWORD PTR [rsp + 32]
+  movsd  xmm2, QWORD PTR [rsp + 8]
+  movsd  xmm3, QWORD PTR [rsp + 16]
+  call   __muldc3
+  movsd  QWORD PTR [rsp + 24], xmm0 # Saving answer
+  movsd  QWORD PTR [rsp + 32], xmm1
+  add    rbx, 1
+  mov    rax, rbp
+  sar    rax, 1
+  jmp    iter_ct_loop_k
+iter_ct_end_k:
+  add    r15, rbp
+  jmp    iter_ct_loop_j
+iter_ct_end_j:
+  add    r14, 1
+  mov    rax, QWORD PTR [rsp]
+  jmp    iter_ct_loop_i
+iter_ct_end_i:
+  add    rsp, 40
+  pop    rbp
+  pop    rbx
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  ret
+
+# rdi - array a ptr
+# rsi - array b ptr
+# rdx - array size
+approx:
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  mov    r12, rdi
+  mov    r13, rsi
+  mov    r14, rdx
+  lea    r15, [rdi + rdx]
+  sub    rsp, 8
+approx_loop:
+  cmp    r12, r15
+  je     approx_return
+  movsd  xmm0, QWORD PTR[r13]
+  movsd  xmm1, QWORD PTR[r13 + 8]
+  call   cabs
+  movsd  QWORD PTR [rsp], xmm0
+  movsd  xmm0, QWORD PTR[r12]
+  movsd  xmm1, QWORD PTR[r12 + 8]
+  call   cabs
+  movsd  xmm1, QWORD PTR [rsp]
+  subsd  xmm0, xmm1
+  mov    rdi, OFFSET fmt
+  mov    rax, 1
+  call   printf
+  add    r12, 16
+  add    r13, 16
+  jmp    approx_loop
+approx_return:
+  add    rsp, 8
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  ret
+
+main:
+  push   r12
+  sub    rsp, 2048
+  mov    rdi, 0
+  call   time
+  mov    edi, eax
+  call   srand
+  lea    r12, [rsp + 1024]
+loop:
+  cmp    r12, rsp
+  je     end_loop
+  sub    r12, 16
+  call   rand
+  cvtsi2sd xmm0, rax
+  divsd  xmm0, rand_max
+  lea    rax, [r12 + 1024]
+  movsd  QWORD PTR [r12], xmm0
+  movsd  QWORD PTR [rax], xmm0
+  mov    QWORD PTR [r12 + 8], 0
+  mov    QWORD PTR [rax + 8], 0
+  jmp    loop
+end_loop:
+  mov    rdi, rsp
+  mov    rsi, 1024
+  call   iterative_cooley_tukey
+  lea    rdi, [rsp + 1024]
+  mov    rsi, 1024
+  call   cooley_tukey
+  mov    rdi, rsp
+  lea    rsi, [rsp + 1024]
+  mov    rdx, 1024
+  call   approx
+  add    rsp, 2048
+  pop    r12
+  ret
+

From 93c76a5b884ab02efad92ec2a577352c21d2b964 Mon Sep 17 00:00:00 2001
From: Gathros <6323830+Gathros@users.noreply.github.com>
Date: Mon, 5 Nov 2018 10:43:59 +0000
Subject: [PATCH 2/4] Updating cooley_tukey.md

---
 contents/cooley_tukey/cooley_tukey.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/contents/cooley_tukey/cooley_tukey.md b/contents/cooley_tukey/cooley_tukey.md
index c2a7434ab..addbb7342 100644
--- a/contents/cooley_tukey/cooley_tukey.md
+++ b/contents/cooley_tukey/cooley_tukey.md
@@ -83,6 +83,8 @@ For some reason, though, putting code to this transformation really helped me fi
 [import:5-11, lang:"python"](code/python/fft.py)
 {% sample lang="scratch" %}
 [import:4-13, lang:"julia"](code/julia/fft.jl)
+{% sample lang="asm-x64" %}
+[import:15-74, lang:"asm-x64"](code/asm-x64/fft.s)
 {% endmethod %}
 
 In this function, we define `n` to be a set of integers from $$0 \rightarrow N-1$$ and arrange them to be a column.
@@ -130,6 +132,8 @@ In the end, the code looks like:
 [import:13-24, lang:"python"](code/python/fft.py)
 {% sample lang="scratch" %}
 [import:16-32, lang:"julia"](code/julia/fft.jl)
+{% sample lang="asm-x64" %}
+[import:76-165, lang:"asm-x64"](code/asm-x64/fft.s)
 {% endmethod %}
 
 As a side note, we are enforcing that the array must be a power of 2 for the operation to work.
@@ -236,6 +240,8 @@ Note: I implemented this in Julia because the code seems more straightforward in
 [import, lang:"python"](code/python/fft.py)
 {% sample lang="scratch" %}
 Some rather impressive scratch code was submitted by Jie and can be found here: https://scratch.mit.edu/projects/37759604/#editor
+{% sample lang="asm-x64" %}
+[import, lang:"asm-x64"](code/asm-x64/fft.s)
 {% endmethod %}
 
 

From a747b4d52a35a9d08ca1e3be0f1c0385c3961ca1 Mon Sep 17 00:00:00 2001
From: Gathros <6323830+Gathros@users.noreply.github.com>
Date: Sat, 17 Nov 2018 23:21:11 +0000
Subject: [PATCH 3/4] return 0 on fft.s

---
 contents/cooley_tukey/code/asm-x64/fft.s | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/contents/cooley_tukey/code/asm-x64/fft.s b/contents/cooley_tukey/code/asm-x64/fft.s
index 1470e87ce..b8fdbe829 100644
--- a/contents/cooley_tukey/code/asm-x64/fft.s
+++ b/contents/cooley_tukey/code/asm-x64/fft.s
@@ -392,7 +392,9 @@ end_loop:
   lea    rsi, [rsp + 1024]
   mov    rdx, 1024
   call   approx
+  xor    rax, rax
   add    rsp, 2048
   pop    r12
   ret
 
+

From 44de64fb7351aa10b7d3d13543f46cfe11f8470a Mon Sep 17 00:00:00 2001
From: Gathros <6323830+Gathros@users.noreply.github.com>
Date: Mon, 24 Dec 2018 11:00:35 +0000
Subject: [PATCH 4/4] small fixes to fft.s

---
 contents/cooley_tukey/code/asm-x64/fft.s | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/contents/cooley_tukey/code/asm-x64/fft.s b/contents/cooley_tukey/code/asm-x64/fft.s
index b8fdbe829..566ec3ae7 100644
--- a/contents/cooley_tukey/code/asm-x64/fft.s
+++ b/contents/cooley_tukey/code/asm-x64/fft.s
@@ -228,8 +228,7 @@ iterative_cooley_tukey:
   push   r14
   push   r15
   push   rbx
-  push   rbp
-  sub    rsp, 40
+  sub    rsp, 48
   mov    r12, rdi
   mov    r13, rsi
   call   bit_reverse              # Bit reversing array
@@ -245,7 +244,8 @@ iter_ct_loop_i:
   movsd  xmm0, two                # Calculate stride = 2^(r14)
   cvtsi2sdq xmm1, r14
   call   pow
-  cvttsd2si rbp, xmm0
+  cvttsd2si r10, xmm0
+  mov    QWORD PTR [rsp + 40], r10# move stride to stack
   movsd  xmm1, two_pi             # Calculating cexp(-2pi * I / stride)
   divsd  xmm1, xmm0
   pxor   xmm0, xmm0
@@ -261,7 +261,7 @@ iter_ct_loop_j:
   movsd  QWORD PTR [rsp + 24], xmm4
   movsd  QWORD PTR [rsp + 32], xmm5
   xor    rbx, rbx
-  mov    rax, rbp                 # Calculate stride / 2
+  mov    rax, QWORD PTR [rsp + 40]# Calculate stride / 2
   sar    rax, 1
 iter_ct_loop_k:
   cmp    rbx, rax                 # Check if rbx is less then stride / 2
@@ -269,7 +269,7 @@ iter_ct_loop_k:
   mov    r8, r15                  # Saving pointers to X[k + j + stride / 2] and X[k + j]
   add    r8, rbx
   sal    r8, 4
-  mov    r9, rbp
+  mov    r9, QWORD PTR [rsp + 40]
   sal    r9, 3
   add    r9, r8
   lea    r9, [r12 + r9]
@@ -301,19 +301,18 @@ iter_ct_loop_k:
   movsd  QWORD PTR [rsp + 24], xmm0 # Saving answer
   movsd  QWORD PTR [rsp + 32], xmm1
   add    rbx, 1
-  mov    rax, rbp
+  mov    rax, QWORD PTR [rsp + 40]
   sar    rax, 1
   jmp    iter_ct_loop_k
 iter_ct_end_k:
-  add    r15, rbp
+  add    r15, QWORD PTR [rsp + 40]
   jmp    iter_ct_loop_j
 iter_ct_end_j:
   add    r14, 1
   mov    rax, QWORD PTR [rsp]
   jmp    iter_ct_loop_i
 iter_ct_end_i:
-  add    rsp, 40
-  pop    rbp
+  add    rsp, 48
   pop    rbx
   pop    r15
   pop    r14