Skip to content

Commit 1048a07

Browse files
authored
Merge pull request #542 from Gathros/fft_x64
Adding FFT in X86_64
2 parents 6daa16f + 44de64f commit 1048a07

File tree

2 files changed

+405
-0
lines changed

2 files changed

+405
-0
lines changed
+399
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,399 @@
1+
.intel_syntax noprefix
2+
3+
.section .rodata
4+
two: .double 2.0
5+
one: .double 1.0
6+
two_pi: .double -6.28318530718
7+
rand_max: .long 4290772992
8+
.long 1105199103
9+
fmt: .string "%g\n"
10+
11+
.section .text
12+
.global main
13+
.extern printf, memset, memcpy, srand, rand, time, cexp, __muldc3, cabs, log2
14+
15+
# rdi - array ptr
16+
# rsi - array size
17+
dft:
18+
push rbx
19+
push r12
20+
push r13
21+
push r14
22+
push r15
23+
mov r12, rdi # Save parameters
24+
mov r13, rsi
25+
sub rsp, r13 # Make a double complex array
26+
xor r14, r14 # Set index to 0
27+
dft_loop_i:
28+
cmp r14, r13 # Check if index is equal to array size
29+
je dft_end_i
30+
lea rax, [rsp + r14] # Set tmp array to zero at r14
31+
mov QWORD PTR [rax], 0
32+
mov QWORD PTR [rax + 8], 0
33+
xor r15, r15 # Set second index to 0
34+
dft_loop_j:
35+
cmp r15, r13 # Check if the index is equal to array size
36+
je dft_end_j
37+
movsd xmm1, two_pi # Calculate xmm1 = -2pi * i * j / N
38+
mov rax, r14
39+
imul rax, r15
40+
shr rax, 4
41+
cvtsi2sdq xmm2, rax
42+
mulsd xmm1, xmm2
43+
cvtsi2sdq xmm2, r13
44+
divsd xmm1, xmm2
45+
pxor xmm0, xmm0 # Set xmm0 to 0
46+
call cexp
47+
lea rax, [r12 + r15] # Calculate X[i] * cexp(-2pi * i * j / N)
48+
movsd xmm2, QWORD PTR [rax]
49+
movsd xmm3, QWORD PTR [rax + 8]
50+
call __muldc3
51+
lea rax, [rsp + r14]
52+
movsd xmm6, QWORD PTR [rax] # Sum to tmp array
53+
movsd xmm7, QWORD PTR [rax + 8]
54+
addsd xmm6, xmm0
55+
addsd xmm7, xmm1
56+
movsd QWORD PTR [rax], xmm6 # Save to tmp array
57+
movsd QWORD PTR [rax + 8], xmm7
58+
add r15, 16
59+
jmp dft_loop_j
60+
dft_end_j:
61+
add r14, 16
62+
jmp dft_loop_i
63+
dft_end_i:
64+
mov rdi, r12 # Move tmp array to array ptr
65+
mov rsi, rsp
66+
mov rdx, r13
67+
call memcpy
68+
add rsp, r13
69+
pop r15
70+
pop r14
71+
pop r13
72+
pop r12
73+
pop rbx
74+
ret
75+
76+
# rdi - array ptr
77+
# rsi - array size
78+
cooley_tukey:
79+
cmp rsi, 16 # Check if size if greater then 1
80+
jle cooley_tukey_return
81+
push rbx
82+
push r12
83+
push r13
84+
push r14
85+
push r15
86+
mov r12, rdi # Save parameters
87+
mov r13, rsi
88+
mov r14, rsi # Save N / 2
89+
shr r14, 1
90+
sub rsp, r14 # Make a tmp array
91+
xor r15, r15
92+
mov rbx, r12
93+
cooley_tukey_spliting:
94+
cmp r15, r14
95+
je cooley_tukey_split
96+
lea rax, [r12 + 2 * r15] # Moving all odd entries to the front of the array
97+
movaps xmm0, XMMWORD PTR [rax + 16]
98+
movaps xmm1, XMMWORD PTR [rax]
99+
movaps XMMWORD PTR [rsp + r15], xmm0
100+
movaps XMMWORD PTR [rbx], xmm1
101+
add rbx, 16
102+
add r15, 16
103+
jmp cooley_tukey_spliting
104+
cooley_tukey_split:
105+
mov rax, rsp
106+
lea rdi, [r12 + r13]
107+
cooley_tukey_mov_data:
108+
cmp rbx, rdi
109+
je cooley_tukey_moved
110+
movaps xmm0, XMMWORD PTR [rax]
111+
movaps XMMWORD PTR [rbx], xmm0
112+
add rbx, 16
113+
add rax, 16
114+
jmp cooley_tukey_mov_data
115+
cooley_tukey_moved:
116+
add rsp, r14
117+
mov rdi, r12 # Makking a recursive call
118+
mov rsi, r14
119+
call cooley_tukey
120+
lea rdi, [r12 + r14] # Makking a recursive call
121+
mov rsi, r14
122+
call cooley_tukey
123+
lea rbx, [r12 + r14]
124+
mov r14, rbx
125+
mov r15, r12
126+
cooley_tukey_loop:
127+
cmp r15, rbx
128+
je cooley_tukey_end
129+
pxor xmm0, xmm0 # Calculate cexp(-2.0 * I * M_PI * i / N)
130+
movsd xmm1, two_pi
131+
mov rax, r14
132+
sub rax, rbx
133+
cvtsi2sdq xmm2, rax
134+
cvtsi2sdq xmm3, r13
135+
divsd xmm2, xmm3
136+
mulsd xmm1, xmm2
137+
call cexp
138+
movq xmm2, QWORD PTR [r14] # Calculating X[i] - cexp() * X[i + N / 2]
139+
movq xmm3, QWORD PTR [r14 + 8]
140+
call __muldc3
141+
movq xmm2, QWORD PTR [r15]
142+
movq xmm3, QWORD PTR [r15 + 8]
143+
subsd xmm2, xmm0
144+
subsd xmm3, xmm1
145+
movq QWORD PTR [r14], xmm2 # Save value in X[i + N / 2]
146+
movq QWORD PTR [r14 + 8], xmm3
147+
movq xmm0, QWORD PTR [r15] # Calculating X[i] -= X[i + N / 2] - X[i]
148+
movq xmm1, QWORD PTR [r15 + 8]
149+
subsd xmm2, xmm0
150+
subsd xmm3, xmm1
151+
subsd xmm0, xmm2
152+
subsd xmm1, xmm3
153+
movq QWORD PTR [r15], xmm0
154+
movq QWORD PTR [r15 + 8], xmm1
155+
add r14, 16
156+
add r15, 16
157+
jmp cooley_tukey_loop
158+
cooley_tukey_end:
159+
pop r15
160+
pop r14
161+
pop r13
162+
pop r12
163+
pop rbx
164+
cooley_tukey_return:
165+
ret
166+
167+
# rdi - array ptr
168+
# rsi - array size
169+
bit_reverse:
170+
push rbx
171+
push r12
172+
push r13
173+
push r14
174+
push r15
175+
mov r12, rdi # Save parameters
176+
mov r13, rsi
177+
shr r13, 4
178+
xor r14, r14 # Loop through all entries
179+
bit_reverse_entries:
180+
cmp r14, r13
181+
je bit_reverse_return
182+
cvtsi2sdq xmm0, r13 # Calculating the number of bit in N
183+
call log2
184+
cvttsd2si rcx, xmm0
185+
mov rdi, 1 # Calculating (1 << log2(N)) - 1
186+
sal edi, cl
187+
sub edi, 1
188+
sub ecx, 1
189+
mov rax, r14
190+
mov r15, r14
191+
bit_reverse_loop:
192+
sar r15 # Check if r15 is 0
193+
je bit_reverse_reversed
194+
sal rax, 1 # Calculating (rax << 1) | (r15 & 1)
195+
mov rsi, r15
196+
and rsi, 1
197+
or rax, rsi
198+
sub ecx, 1 # Decrement bit count
199+
jmp bit_reverse_loop
200+
bit_reverse_reversed:
201+
sal eax, cl # Calculate (rax << rcx) & (1 << bit count)
202+
and rax, rdi
203+
cmp rax, r14 # Check if rax is greater then r14
204+
jle bit_reverse_no_swap # If so then swap entries
205+
shl rax, 4 # Times index by 16 to get bytes to entry
206+
shl r14, 4
207+
movaps xmm0, XMMWORD PTR [r12 + rax]
208+
movaps xmm1, XMMWORD PTR [r12 + r14]
209+
movaps XMMWORD PTR [r12 + rax], xmm1
210+
movaps XMMWORD PTR [r12 + r14], xmm0
211+
shr r14, 4
212+
bit_reverse_no_swap:
213+
add r14, 1
214+
jmp bit_reverse_entries
215+
bit_reverse_return:
216+
pop r15
217+
pop r14
218+
pop r13
219+
pop r12
220+
pop rbx
221+
ret
222+
223+
# rdi - array ptr
224+
# rsi - array size
225+
iterative_cooley_tukey:
226+
push r12
227+
push r13
228+
push r14
229+
push r15
230+
push rbx
231+
sub rsp, 48
232+
mov r12, rdi
233+
mov r13, rsi
234+
call bit_reverse # Bit reversing array
235+
sar r13, 4 # Calculate log2(N)
236+
cvtsi2sdq xmm0, r13
237+
call log2
238+
cvttsd2si rax, xmm0
239+
mov QWORD PTR [rsp], rax # Save it to the stack
240+
mov r14, 1
241+
iter_ct_loop_i:
242+
cmp r14, rax # Check if r14 is greater then log2(N)
243+
jg iter_ct_end_i
244+
movsd xmm0, two # Calculate stride = 2^(r14)
245+
cvtsi2sdq xmm1, r14
246+
call pow
247+
cvttsd2si r10, xmm0
248+
mov QWORD PTR [rsp + 40], r10# move stride to stack
249+
movsd xmm1, two_pi # Calculating cexp(-2pi * I / stride)
250+
divsd xmm1, xmm0
251+
pxor xmm0, xmm0
252+
call cexp
253+
movq QWORD PTR [rsp + 8], xmm0 # Save it to stack
254+
movq QWORD PTR [rsp + 16], xmm1
255+
xor r15, r15
256+
iter_ct_loop_j:
257+
cmp r15, r13 # Check if r15 is less then array size
258+
je iter_ct_end_j
259+
movsd xmm4, one # Save 1 + 0i to stack
260+
pxor xmm5, xmm5
261+
movsd QWORD PTR [rsp + 24], xmm4
262+
movsd QWORD PTR [rsp + 32], xmm5
263+
xor rbx, rbx
264+
mov rax, QWORD PTR [rsp + 40]# Calculate stride / 2
265+
sar rax, 1
266+
iter_ct_loop_k:
267+
cmp rbx, rax # Check if rbx is less then stride / 2
268+
je iter_ct_end_k
269+
mov r8, r15 # Saving pointers to X[k + j + stride / 2] and X[k + j]
270+
add r8, rbx
271+
sal r8, 4
272+
mov r9, QWORD PTR [rsp + 40]
273+
sal r9, 3
274+
add r9, r8
275+
lea r9, [r12 + r9]
276+
lea r8, [r12 + r8]
277+
movsd xmm0, QWORD PTR [r9] # Calculate X[k + j] - v * X[k + j + stride / 2]
278+
movsd xmm1, QWORD PTR [r9 + 8]
279+
movsd xmm2, QWORD PTR [rsp + 24]
280+
movsd xmm3, QWORD PTR [rsp + 32]
281+
call __muldc3
282+
movsd xmm2, QWORD PTR [r8]
283+
movsd xmm3, QWORD PTR [r8 + 8]
284+
subsd xmm2, xmm0
285+
subsd xmm3, xmm1
286+
movsd QWORD PTR [r9], xmm2 # Saving answer
287+
movsd QWORD PTR [r9 + 8], xmm3
288+
movsd xmm0, QWORD PTR [r8] # Calculating X[k + j] - (X[k + j + stride / 2] - X[k + j])
289+
movsd xmm1, QWORD PTR [r8 + 8]
290+
subsd xmm2, xmm0
291+
subsd xmm3, xmm1
292+
subsd xmm0, xmm2
293+
subsd xmm1, xmm3
294+
movsd QWORD PTR [r8], xmm0 # Saving answer
295+
movsd QWORD PTR [r8 + 8], xmm1
296+
movsd xmm0, QWORD PTR [rsp + 24] # Calculating v * w
297+
movsd xmm1, QWORD PTR [rsp + 32]
298+
movsd xmm2, QWORD PTR [rsp + 8]
299+
movsd xmm3, QWORD PTR [rsp + 16]
300+
call __muldc3
301+
movsd QWORD PTR [rsp + 24], xmm0 # Saving answer
302+
movsd QWORD PTR [rsp + 32], xmm1
303+
add rbx, 1
304+
mov rax, QWORD PTR [rsp + 40]
305+
sar rax, 1
306+
jmp iter_ct_loop_k
307+
iter_ct_end_k:
308+
add r15, QWORD PTR [rsp + 40]
309+
jmp iter_ct_loop_j
310+
iter_ct_end_j:
311+
add r14, 1
312+
mov rax, QWORD PTR [rsp]
313+
jmp iter_ct_loop_i
314+
iter_ct_end_i:
315+
add rsp, 48
316+
pop rbx
317+
pop r15
318+
pop r14
319+
pop r13
320+
pop r12
321+
ret
322+
323+
# rdi - array a ptr
324+
# rsi - array b ptr
325+
# rdx - array size
326+
approx:
327+
push r12
328+
push r13
329+
push r14
330+
push r15
331+
mov r12, rdi
332+
mov r13, rsi
333+
mov r14, rdx
334+
lea r15, [rdi + rdx]
335+
sub rsp, 8
336+
approx_loop:
337+
cmp r12, r15
338+
je approx_return
339+
movsd xmm0, QWORD PTR[r13]
340+
movsd xmm1, QWORD PTR[r13 + 8]
341+
call cabs
342+
movsd QWORD PTR [rsp], xmm0
343+
movsd xmm0, QWORD PTR[r12]
344+
movsd xmm1, QWORD PTR[r12 + 8]
345+
call cabs
346+
movsd xmm1, QWORD PTR [rsp]
347+
subsd xmm0, xmm1
348+
mov rdi, OFFSET fmt
349+
mov rax, 1
350+
call printf
351+
add r12, 16
352+
add r13, 16
353+
jmp approx_loop
354+
approx_return:
355+
add rsp, 8
356+
pop r15
357+
pop r14
358+
pop r13
359+
pop r12
360+
ret
361+
362+
main:
363+
push r12
364+
sub rsp, 2048
365+
mov rdi, 0
366+
call time
367+
mov edi, eax
368+
call srand
369+
lea r12, [rsp + 1024]
370+
loop:
371+
cmp r12, rsp
372+
je end_loop
373+
sub r12, 16
374+
call rand
375+
cvtsi2sd xmm0, rax
376+
divsd xmm0, rand_max
377+
lea rax, [r12 + 1024]
378+
movsd QWORD PTR [r12], xmm0
379+
movsd QWORD PTR [rax], xmm0
380+
mov QWORD PTR [r12 + 8], 0
381+
mov QWORD PTR [rax + 8], 0
382+
jmp loop
383+
end_loop:
384+
mov rdi, rsp
385+
mov rsi, 1024
386+
call iterative_cooley_tukey
387+
lea rdi, [rsp + 1024]
388+
mov rsi, 1024
389+
call cooley_tukey
390+
mov rdi, rsp
391+
lea rsi, [rsp + 1024]
392+
mov rdx, 1024
393+
call approx
394+
xor rax, rax
395+
add rsp, 2048
396+
pop r12
397+
ret
398+
399+

0 commit comments

Comments
 (0)