|
54 | 54 | #define kernel_kstart_n10(mdim,updk) ""
|
55 | 55 | #define kernel_kstart_n12(mdim,updk) ""
|
56 | 56 | #define kernel_kend_n4(mdim) "xorq %3,%3;"\
|
57 |
| - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8)\ |
58 |
| - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) |
| 57 | + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0)\ |
| 58 | + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) |
59 | 59 | #define kernel_kend_n6(mdim) "xorq %3,%3;"\
|
60 |
| - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8)\ |
61 |
| - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24)\ |
62 |
| - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40)\ |
63 |
| - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) |
| 60 | + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0)\ |
| 61 | + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16)\ |
| 62 | + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32)\ |
| 63 | + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) |
64 | 64 | #define kernel_kend_n8(mdim) "xorq %3,%3;"\
|
65 |
| - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8)\ |
66 |
| - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24)\ |
67 |
| - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40)\ |
68 |
| - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56)\ |
69 |
| - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72)\ |
70 |
| - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) |
| 65 | + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0)\ |
| 66 | + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16)\ |
| 67 | + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32)\ |
| 68 | + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48)\ |
| 69 | + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64)\ |
| 70 | + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) |
71 | 71 | #define kernel_kend_n10(mdim) "xorq %3,%3;"\
|
72 |
| - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8)\ |
73 |
| - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24)\ |
74 |
| - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40)\ |
75 |
| - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56)\ |
76 |
| - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72)\ |
77 |
| - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88)\ |
78 |
| - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104)\ |
79 |
| - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) |
| 72 | + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0)\ |
| 73 | + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16)\ |
| 74 | + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32)\ |
| 75 | + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48)\ |
| 76 | + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64)\ |
| 77 | + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80)\ |
| 78 | + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96)\ |
| 79 | + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) |
80 | 80 | #define kernel_kend_n12(mdim) "xorq %3,%3;"\
|
81 |
| - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8) acc_kend_nc6_k1m##mdim(0,8)\ |
82 |
| - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24) acc_kend_nc6_k1m##mdim(16,24)\ |
83 |
| - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40) acc_kend_nc6_k1m##mdim(32,40)\ |
84 |
| - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56) acc_kend_nc6_k1m##mdim(48,56)\ |
85 |
| - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72) acc_kend_nc6_k1m##mdim(64,72)\ |
86 |
| - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88) acc_kend_nc6_k1m##mdim(80,88)\ |
87 |
| - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104) acc_kend_nc6_k1m##mdim(96,104)\ |
88 |
| - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) acc_kend_nc6_k1m##mdim(112,120)\ |
89 |
| - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128,136)\ |
90 |
| - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144,152) |
| 81 | + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0) acc_kend_nc6_k1m##mdim(0)\ |
| 82 | + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16) acc_kend_nc6_k1m##mdim(16)\ |
| 83 | + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32) acc_kend_nc6_k1m##mdim(32)\ |
| 84 | + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48) acc_kend_nc6_k1m##mdim(48)\ |
| 85 | + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64) acc_kend_nc6_k1m##mdim(64)\ |
| 86 | + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80) acc_kend_nc6_k1m##mdim(80)\ |
| 87 | + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96) acc_kend_nc6_k1m##mdim(96)\ |
| 88 | + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) acc_kend_nc6_k1m##mdim(112)\ |
| 89 | + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128)\ |
| 90 | + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144) |
91 | 91 | #endif
|
92 | 92 | #else
|
93 | 93 | #define HEAD_SET_OFF(ndim) {}
|
|
129 | 129 | #define init_update_k(mdim) ""
|
130 | 130 | #define save_update_k(mdim) ""
|
131 | 131 | #endif
|
132 |
| - |
| 132 | + |
133 | 133 | #define KERNEL_h_k1m16n1 \
|
134 | 134 | "vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\
|
135 | 135 | "vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;"
|
136 | 136 | #define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;"
|
137 |
| -#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ |
| 137 | +#ifdef BROADCAST_KERNEL |
| 138 | + #define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ |
138 | 139 | "vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;"
|
139 |
| -#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" |
140 |
| -#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,boff2,...)\ |
| 140 | + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ |
141 | 141 | "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\
|
142 |
| - "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" |
143 |
| -#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,8,__VA_ARGS__) |
| 142 | + "vbroadcastsd "#boff1"+8("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" |
| 143 | + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) |
| 144 | +#else |
| 145 | + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ |
| 146 | + "vbroadcastf32x4 "#boff1"("#__VA_ARGS__"),%%zmm5; vfmadd231pd %%zmm1,%%zmm5,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm5,%%zmm"#c2_no";"\ |
| 147 | + "vfmadd231pd %%zmm3,%%zmm5,%%zmm"#c3_no"; vfmadd231pd %%zmm4,%%zmm5,%%zmm"#c4_no";" |
| 148 | + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) |
| 149 | + #define KERNEL_h_k1m16n2 \ |
| 150 | + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\ |
| 151 | + unit_acc_m16n2(8,9,10,11,%1) |
| 152 | +#endif |
| 153 | +#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" |
144 | 154 | #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1)
|
145 | 155 | #define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;"
|
146 | 156 | #define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;"
|
|
151 | 161 | #define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2)
|
152 | 162 | #define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;"
|
153 | 163 | #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
|
| 164 | + #ifdef BROADCAST_KERNEL |
154 | 165 | #define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;"
|
155 |
| - #define acc_kend_nc2_k1m16(boff1,boff2) unit_acc_gen_m16n2(12,13,14,15,boff1,boff2,%1,%%r12,1) |
156 |
| - #define acc_kend_nc3_k1m16(boff1,boff2) unit_acc_gen_m16n2(16,17,18,19,boff1,boff2,%1,%%r12,2) |
157 |
| - #define acc_kend_nc4_k1m16(boff1,boff2) unit_acc_gen_m16n2(20,21,22,23,boff1,boff2,%%r15) |
158 |
| - #define acc_kend_nc5_k1m16(boff1,boff2) unit_acc_gen_m16n2(24,25,26,27,boff1,boff2,%%r15,%%r12,1) |
159 |
| - #define acc_kend_nc6_k1m16(boff1,boff2) unit_acc_gen_m16n2(28,29,30,31,boff1,boff2,%%r15,%%r12,2) |
| 166 | + #else |
| 167 | + #define loada_kend_k1m16 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; vmovddup 64(%0,%3,1),%%zmm3; vmovddup 72(%0,%3,1),%%zmm4; addq $128,%3;" |
| 168 | + #endif |
| 169 | + #define acc_kend_nc2_k1m16(boff1) unit_acc_gen_m16n2(12,13,14,15,boff1,%1,%%r12,1) |
| 170 | + #define acc_kend_nc3_k1m16(boff1) unit_acc_gen_m16n2(16,17,18,19,boff1,%1,%%r12,2) |
| 171 | + #define acc_kend_nc4_k1m16(boff1) unit_acc_gen_m16n2(20,21,22,23,boff1,%%r15) |
| 172 | + #define acc_kend_nc5_k1m16(boff1) unit_acc_gen_m16n2(24,25,26,27,boff1,%%r15,%%r12,1) |
| 173 | + #define acc_kend_nc6_k1m16(boff1) unit_acc_gen_m16n2(28,29,30,31,boff1,%%r15,%%r12,2) |
160 | 174 | #endif
|
161 | 175 | #define save_init_m16 "movq %2,%3; addq $128,%2;"
|
162 | 176 | #ifdef TRMMKERNEL
|
163 | 177 | #define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;"
|
| 178 | + #ifdef BROADCAST_KERNEL |
164 | 179 | #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
|
165 | 180 | "vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\
|
166 | 181 | "vmulpd %%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;"
|
| 182 | + #else |
| 183 | + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ |
| 184 | + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ |
| 185 | + "vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ |
| 186 | + "vmulpd %%zmm3,%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vmulpd %%zmm4,%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" |
| 187 | + #endif |
167 | 188 | #else
|
168 | 189 | #define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;"
|
| 190 | + #ifdef BROADCAST_KERNEL |
169 | 191 | #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
|
170 | 192 | "vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\
|
171 | 193 | "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;"
|
| 194 | + #else |
| 195 | + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ |
| 196 | + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ |
| 197 | + "vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ |
| 198 | + "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" |
| 199 | + #endif |
172 | 200 | #endif
|
173 | 201 | #define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11)
|
174 | 202 | #define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15)
|
|
206 | 234 | #define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;"
|
207 | 235 | #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
|
208 | 236 | #define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;"
|
209 |
| - #define acc_kend_nc2_k1m8(boff1,boff2) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) |
210 |
| - #define acc_kend_nc3_k1m8(boff1,boff2) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) |
211 |
| - #define acc_kend_nc4_k1m8(boff1,boff2) unit_acc_gen_m8n2(14,15,boff1,%%r15) |
212 |
| - #define acc_kend_nc5_k1m8(boff1,boff2) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) |
213 |
| - #define acc_kend_nc6_k1m8(boff1,boff2) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) |
| 237 | + #define acc_kend_nc2_k1m8(boff1) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) |
| 238 | + #define acc_kend_nc3_k1m8(boff1) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) |
| 239 | + #define acc_kend_nc4_k1m8(boff1) unit_acc_gen_m8n2(14,15,boff1,%%r15) |
| 240 | + #define acc_kend_nc5_k1m8(boff1) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) |
| 241 | + #define acc_kend_nc6_k1m8(boff1) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) |
214 | 242 | #endif
|
215 | 243 | #define save_init_m8 "movq %2,%3; addq $64,%2;"
|
216 | 244 | #ifdef TRMMKERNEL
|
|
258 | 286 | #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;"
|
259 | 287 | #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
|
260 | 288 | #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;"
|
261 |
| - #define acc_kend_nc2_k1m4(boff1,boff2) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) |
262 |
| - #define acc_kend_nc3_k1m4(boff1,boff2) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) |
263 |
| - #define acc_kend_nc4_k1m4(boff1,boff2) unit_acc_gen_m4n2(10,11,boff1,%%r15) |
264 |
| - #define acc_kend_nc5_k1m4(boff1,boff2) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) |
265 |
| - #define acc_kend_nc6_k1m4(boff1,boff2) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) |
| 289 | + #define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) |
| 290 | + #define acc_kend_nc3_k1m4(boff1) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) |
| 291 | + #define acc_kend_nc4_k1m4(boff1) unit_acc_gen_m4n2(10,11,boff1,%%r15) |
| 292 | + #define acc_kend_nc5_k1m4(boff1) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) |
| 293 | + #define acc_kend_nc6_k1m4(boff1) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) |
266 | 294 | #endif
|
267 | 295 | #define save_init_m4 "movq %2,%3; addq $32,%2;"
|
268 | 296 | #ifdef TRMMKERNEL
|
|
311 | 339 | #define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;"
|
312 | 340 | #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
|
313 | 341 | #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;"
|
314 |
| - #define acc_kend_nc2_k1m2(boff1,boff2) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) |
315 |
| - #define acc_kend_nc3_k1m2(boff1,boff2) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) |
316 |
| - #define acc_kend_nc4_k1m2(boff1,boff2) unit_acc_gen_m2n2(10,11,boff1,%%r15) |
317 |
| - #define acc_kend_nc5_k1m2(boff1,boff2) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) |
318 |
| - #define acc_kend_nc6_k1m2(boff1,boff2) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) |
| 342 | + #define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) |
| 343 | + #define acc_kend_nc3_k1m2(boff1) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) |
| 344 | + #define acc_kend_nc4_k1m2(boff1) unit_acc_gen_m2n2(10,11,boff1,%%r15) |
| 345 | + #define acc_kend_nc5_k1m2(boff1) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) |
| 346 | + #define acc_kend_nc6_k1m2(boff1) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) |
319 | 347 | #endif
|
320 | 348 | #define save_init_m2 "movq %2,%3; addq $16,%2;"
|
321 | 349 | #ifdef TRMMKERNEL
|
|
362 | 390 | #define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;"
|
363 | 391 | #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
|
364 | 392 | #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;"
|
365 |
| - #define acc_kend_nc2_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" |
366 |
| - #define acc_kend_nc3_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" |
367 |
| - #define acc_kend_nc4_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" |
368 |
| - #define acc_kend_nc5_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" |
369 |
| - #define acc_kend_nc6_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" |
| 393 | + #define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" |
| 394 | + #define acc_kend_nc3_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" |
| 395 | + #define acc_kend_nc4_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" |
| 396 | + #define acc_kend_nc5_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" |
| 397 | + #define acc_kend_nc6_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" |
370 | 398 | #endif
|
371 | 399 | #define save_init_m1 "movq %2,%3; addq $8,%2;"
|
372 | 400 | #ifdef TRMMKERNEL
|
|
0 commit comments