38 registers = [
"ymm0",
"ymm1",
"ymm2",
"ymm3",
"ymm4",
"ymm5",
"ymm6",
"ymm7",
"ymm8",
"ymm9",
"ymm10",
"ymm11",
"ymm12"]
39 reg_locked = [0] * len(registers)
47 if(plimbs > len(registers)):
48 print(
"ERROR: Index out range")
50 state = registers[:plimbs]
58 S = S +
".global mult_avx2_"+ str(plimbs) +
"x" + str(plimbs) +
"\n"
59 S = S +
"mult_avx2_"+ str(plimbs) +
"x" + str(plimbs) +
":\n"
62 countAs = min(len(registers) - plimbs, math.ceil(plimbs/2))
63 S = S +
"############## load as much a's as possible\n"
64 for j
in range(0, countAs):
65 S = S +
" vmovdqa " + registers[j] +
", YMMWORD PTR [rsi + " + str(j*32) +
"] \n"
68 for j
in range(0,plimbs):
70 S = S +
"\n\n############## perm to b_" + str(j) +
"\n"
71 S = S +
" vpshufd ymm15, ymm15, 177\n"
74 S = S +
"\n\n############## load b_" + str(j) +
"\n"
75 S = S +
" vmovdqa ymm15, YMMWORD PTR [rdx + " + str(j//2*32) +
"] \n"
78 S = S +
"############## w/o adds\n"
79 for k
in range(0,plimbs):
81 S = S +
"############## EQUAL\n"
82 S = S +
"############## a_" + str(k) +
" * b_" + str(j) +
"\n"
84 S = S +
"############## read from regs\n"
87 S = S +
"############## perm " + registers[k//2] +
" to a_" + str(k) +
"\n"
88 S = S +
" vpshufd " + registers[k//2] +
", " + registers[k//2] +
", 177\n"
91 S = S +
"############## " + registers[k//2] +
" = a_" + str(k) +
"\n"
93 S = S +
" vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
" , ymm15, " + registers[k//2] +
"\n"
95 S = S +
"############## read from stack\n"
97 S = S +
"############## perm to a_" + str(k) +
"\n"
98 S = S +
" vpshufd ymm13, ymm13, 177\n"
100 S = S +
" vmovdqa ymm13, YMMWORD PTR [rsi + " + str((k//2)*32) +
"]\n"
101 S = S +
"############## load a_" + str(k) +
"\n"
102 S = S +
" vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm15, ymm13\n"
103 S = S +
" vmovdqu YMMWORD PTR [rdi + " + str(j*32) +
"], " + registers[len(registers)-1 - j] +
"\n"
104 S = S +
" vxorps " + registers[len(registers)-1 - j] +
", " + registers[len(registers)-1 - j] +
", " + registers[len(registers)-1 - j] +
"\n"
107 S = S +
"############## perm = " + str(perm) +
"\n"
108 S = S +
"############## w/ adds\n"
109 for k
in range(0,plimbs):
113 S = S +
"############## a_" + str(k-1) +
" * b_" + str(j) +
"\n"
115 S = S +
"############## a_" + str(k+1) +
" * b_" + str(j) +
"\n"
117 S = S +
"############## a_" + str(k) +
" * b_" + str(j) +
"\n"
118 S = S +
"############## read from regs\n"
121 S = S +
"############## perm " + registers[k//2] +
" to a_" + str(k-1) +
"\n"
122 S = S +
" vpshufd " + registers[k//2] +
", " + registers[k//2] +
", 177\n"
124 S = S +
"############## save add\n"
126 S = S +
" vpmuldq ymm14, ymm15, " + registers[k//2] +
"\n"
127 S = S +
" vpaddq " + registers[len(registers)-1 - ((j+k-1)%plimbs)] +
", ymm14, " + registers[len(registers)-1 - ((j+k-1)%plimbs)] +
" \n"
130 S = S +
"############## save add\n"
131 S = S +
"############## " + registers[k//2] +
" = a_" + str(k+1) +
"\n"
132 S = S +
" vpmuldq ymm14, ymm15, " + registers[k//2] +
"\n"
133 S = S +
" vpaddq " + registers[len(registers)-1 - ((j+k+1)%plimbs)] +
", ymm14, " + registers[len(registers)-1 - ((j+k+1)%plimbs)] +
" \n"
137 S = S +
"############## save add\n"
138 S = S +
"############## perm " + registers[k//2] +
" to a_" + str(k) +
"\n"
139 S = S +
" vpshufd " + registers[k//2] +
", " + registers[k//2] +
", 177\n"
140 S = S +
" vpmuldq ymm14, ymm15, " + registers[k//2] +
"\n"
141 S = S +
" vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] +
" \n"
143 S = S +
"############## " + registers[k//2] +
" = a_" + str(k) +
"\n"
144 S = S +
" vpmuldq ymm14, ymm15, " + registers[k//2] +
"\n"
145 S = S +
" vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] +
" \n"
149 S = S +
"############## a_" + str(k) +
" * b_" + str(j) +
"\n"
150 S = S +
"############## read from stack\n"
152 S = S +
"############## perm to a_" + str(k) +
"\n"
153 S = S +
" vpshufd ymm13, ymm13, 177\n"
156 S = S +
"############## load a_" + str(k) +
"\n"
157 S = S +
" vmovdqa ymm13, YMMWORD PTR [rsi + " + str((k//2)*32) +
"]\n"
159 S = S +
"############## save add\n"
160 S = S +
" vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm15, ymm13\n"
162 S = S +
" vpmuldq ymm14, ymm15, ymm13\n"
163 S = S +
" vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] +
"\n"
166 S = S +
" vmovdqu YMMWORD PTR [rdi + " + str(j*32) +
"] , " + registers[len(registers)-1 - j] +
"\n"
167 S = S +
" vxorps " + registers[len(registers)-1 - j] +
", " + registers[len(registers)-1 - j] +
", " + registers[len(registers)-1 - j] +
"\n"
170 S = S +
"############## write all the rest\n"
171 for j
in range(0,plimbs-1):
172 S = S +
" vmovdqu YMMWORD PTR [rdi + " + str((plimbs + j)*32) +
"], " + registers[len(registers)- j - 1] +
"\n"
# Fragment of a generator that appends x86 AVX2 assembly text to S for a
# routine labelled mult_avx2_<plimbs>x<plimbs>.
# NOTE(review): the leading numbers look like original file line numbers and
# they skip values, so the enclosing def and the if/else lines that choose
# between the emitted alternatives below are not visible in this excerpt.
#
# Pool of YMM register names indexed below. ymm13, ymm14 and ymm15 are not in
# the pool; they only appear hard-coded inside the emitted strings.
185 registers = [
"ymm0",
"ymm1",
"ymm2",
"ymm3",
"ymm4",
"ymm5",
"ymm6",
"ymm7",
"ymm8",
"ymm9",
"ymm10",
"ymm11",
"ymm12"]
# One zero-initialised flag per pool register (not read in the visible lines).
186 reg_locked = [0] * len(registers)
# Only a print is visible for the oversize case; whether generation stops
# afterwards cannot be seen from here.
194 if(plimbs > len(registers)):
195 print(
"ERROR: Index out range")
# First plimbs pool names (not referenced again in the visible lines).
197 state = registers[:plimbs]
# Emit the .global directive and the entry label.
205 S = S +
".global mult_avx2_"+ str(plimbs) +
"x" + str(plimbs) +
"\n"
206 S = S +
"mult_avx2_"+ str(plimbs) +
"x" + str(plimbs) +
":\n"
# Number of a-vectors preloaded into pool registers: limited by the entries
# left once the last plimbs pool entries are set aside (those are indexed from
# the end of the pool as accumulators below) and by ceil(plimbs/2), because
# a_k is read from registers[k//2], i.e. two consecutive k share one vector.
209 countAs = min(len(registers) - plimbs, math.ceil(plimbs/2))
210 S = S +
"############## load as much a's as possible\n"
# Preload: 32-byte vector j of the buffer at rsi goes into registers[j].
211 for j
in range(0, countAs):
212 S = S +
" vmovdqa " + registers[j] +
", YMMWORD PTR [rsi + " + str(j*32) +
"] \n"
# Outer loop over the b index j; ymm15 holds the current b operand.
215 for j
in range(0,plimbs):
# Alternative 1 (guard not visible): reuse the vector already in ymm15.
# vpshufd with immediate 177 (0xB1) swaps each pair of adjacent 32-bit
# elements; vpmuldq only reads the low 32 bits of every 64-bit element, so the
# swap presumably exposes the other b of the pair - confirm.
217 S = S +
"\n\n############## perm to b_" + str(j) +
"\n"
218 S = S +
" vpshufd ymm15, ymm15, 177\n"
# Alternative 2 (guard not visible): load the vector containing b_j from the
# buffer at rdx; the offset is (j//2)*32, so b_j and its neighbour share it.
221 S = S +
"\n\n############## load b_" + str(j) +
"\n"
222 S = S +
" vmovdqa ymm15, YMMWORD PTR [rdx + " + str(j//2*32) +
"] \n"
# "w/o adds" pass: products are written straight into the accumulator
# registers[len(registers)-1 - ((j+k)%plimbs)] with no vpaddq.
# NOTE(review): the condition selecting this pass over the "w/ adds" pass
# further down is not visible.
225 S = S +
"############## w/o adds\n"
227 for k
in range(0,plimbs):
228 S = S +
"############## a_" + str(k) +
" * b_" + str(j) +
"\n"
230 S = S +
"############## read from regs\n"
# a operand taken from a preloaded pool register, optionally lane-swapped
# first (the guard for the swap is not visible).
233 S = S +
"############## perm " + registers[k//2] +
" to a_" + str(k) +
"\n"
234 S = S +
" vpshufd " + registers[k//2] +
", " + registers[k//2] +
", 177\n"
237 S = S +
"############## " + registers[k//2] +
" = a_" + str(k) +
"\n"
239 S = S +
" vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
" , ymm15, " + registers[k//2] +
"\n"
# a operand not resident in the pool: ymm13 is the scratch register, either
# lane-swapped in place or reloaded from [rsi + (k//2)*32] (guards not
# visible), then multiplied into the accumulator.
241 S = S +
"############## read from stack\n"
243 S = S +
"############## perm to a_" + str(k) +
"\n"
244 S = S +
" vpshufd ymm13, ymm13, 177\n"
246 S = S +
" vmovdqa ymm13, YMMWORD PTR [rsi + " + str((k//2)*32) +
"]\n"
247 S = S +
"############## load a_" + str(k) +
"\n"
248 S = S +
" vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm15, ymm13\n"
# Store accumulator registers[len(registers)-1 - j] to output vector j at rdi
# and zero it with vxorps; presumably so the register can be reused when
# (j+k)%plimbs wraps around - confirm.
249 S = S +
" vmovdqu YMMWORD PTR [rdi + " + str(j*32) +
"], " + registers[len(registers)-1 - j] +
"\n"
250 S = S +
" vxorps " + registers[len(registers)-1 - j] +
", " + registers[len(registers)-1 - j] +
", " + registers[len(registers)-1 - j] +
"\n"
# NOTE(review): perm is not assigned in the visible lines; its value is only
# echoed into an assembly comment here.
253 S = S +
"############## perm = " + str(perm) +
"\n"
# "w/ adds" pass: each product goes to the temporary ymm14 and is then added
# to the accumulator with vpaddq.
254 S = S +
"############## w/ adds\n"
255 for k
in range(0,plimbs):
# Three header variants (a_{k-1}, a_{k+1}, a_k) are emitted under guards that
# are not visible; they match the three accumulator index forms used below.
259 S = S +
"############## a_" + str(k-1) +
" * b_" + str(j) +
"\n"
261 S = S +
"############## a_" + str(k+1) +
" * b_" + str(j) +
"\n"
263 S = S +
"############## a_" + str(k) +
" * b_" + str(j) +
"\n"
264 S = S +
"############## read from regs\n"
# Variant accumulating into index (j+k-1)%plimbs after a lane swap.
267 S = S +
"############## perm " + registers[k//2] +
" to a_" + str(k-1) +
"\n"
268 S = S +
" vpshufd " + registers[k//2] +
", " + registers[k//2] +
", 177\n"
270 S = S +
"############## save add\n"
272 S = S +
" vpmuldq ymm14, ymm15, " + registers[k//2] +
"\n"
273 S = S +
" vpaddq " + registers[len(registers)-1 - ((j+k-1)%plimbs)] +
", ymm14, " + registers[len(registers)-1 - ((j+k-1)%plimbs)] +
" \n"
# Variant accumulating into index (j+k+1)%plimbs without a lane swap.
276 S = S +
"############## save add\n"
277 S = S +
"############## " + registers[k//2] +
" = a_" + str(k+1) +
"\n"
278 S = S +
" vpmuldq ymm14, ymm15, " + registers[k//2] +
"\n"
279 S = S +
" vpaddq " + registers[len(registers)-1 - ((j+k+1)%plimbs)] +
", ymm14, " + registers[len(registers)-1 - ((j+k+1)%plimbs)] +
" \n"
# Variant accumulating into index (j+k)%plimbs, with a lane swap first.
283 S = S +
"############## save add\n"
284 S = S +
"############## perm " + registers[k//2] +
" to a_" + str(k) +
"\n"
285 S = S +
" vpshufd " + registers[k//2] +
", " + registers[k//2] +
", 177\n"
286 S = S +
" vpmuldq ymm14, ymm15, " + registers[k//2] +
"\n"
287 S = S +
" vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] +
" \n"
# Variant accumulating into index (j+k)%plimbs, register already in position.
289 S = S +
"############## " + registers[k//2] +
" = a_" + str(k) +
"\n"
290 S = S +
" vpmuldq ymm14, ymm15, " + registers[k//2] +
"\n"
291 S = S +
" vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] +
" \n"
# a operand not resident in the pool: go through scratch register ymm13.
295 S = S +
"############## a_" + str(k) +
" * b_" + str(j) +
"\n"
296 S = S +
"############## read from stack\n"
298 S = S +
"############## perm to a_" + str(k) +
"\n"
299 S = S +
" vpshufd ymm13, ymm13, 177\n"
302 S = S +
"############## load a_" + str(k) +
"\n"
303 S = S +
" vmovdqa ymm13, YMMWORD PTR [rsi + " + str((k//2)*32) +
"]\n"
305 S = S +
"############## save add\n"
# Two alternatives follow (guard not visible): overwrite the accumulator
# directly, or multiply into ymm14 and add into the accumulator.
306 S = S +
" vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm15, ymm13\n"
308 S = S +
" vpmuldq ymm14, ymm15, ymm13\n"
309 S = S +
" vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] +
"\n"
# Store the accumulator for output vector j and clear it, as in the first pass.
312 S = S +
" vmovdqu YMMWORD PTR [rdi + " + str(j*32) +
"] , " + registers[len(registers)-1 - j] +
"\n"
313 S = S +
" vxorps " + registers[len(registers)-1 - j] +
", " + registers[len(registers)-1 - j] +
", " + registers[len(registers)-1 - j] +
"\n"
# Epilogue: store registers[len(registers)-1 - j] for j in 0..plimbs-2 to
# output vectors plimbs..2*plimbs-2 at rdi.
316 S = S +
"############## write all the rest\n"
317 for j
in range(0,plimbs-1):
318 S = S +
" vmovdqu YMMWORD PTR [rdi + " + str((plimbs + j)*32) +
"], " + registers[len(registers)- j - 1] +
"\n"
# Fragment of a generator that appends AVX2 assembly text to S for a routine
# labelled sub_<plimbs>x<plimbs>_avx2.
# NOTE(review): the leading numbers look like original file line numbers; the
# enclosing def and any return are not visible in this excerpt.
#
# YMM register names cycled through by the loop below.
351 registers = [
"ymm0",
"ymm1",
"ymm2",
"ymm3",
"ymm4",
"ymm5",
"ymm6",
"ymm7",
"ymm8",
"ymm9",
"ymm10",
"ymm11",
"ymm12",
"ymm13"]
# Emit the .global directive and the entry label.
355 S = S +
".global sub_"+ str(plimbs) +
"x" + str(plimbs) +
"_avx2\n"
357 S = S +
"sub_"+ str(plimbs) +
"x" + str(plimbs) +
"_avx2:\n"
# For each 32-byte vector i the emitted code computes, in place at rdi,
#   [rdi + 32*i] = [rdi + 32*i] - [rsi + 32*i] - [rdx + 32*i]
# using vpsubd (packed 32-bit subtraction). The working register is picked
# round-robin with i % len(registers), so plimbs may exceed the list length.
358 for i
in range(0, plimbs):
359 S = S +
" vmovdqa " + registers[i%len(registers)] +
", ymmword ptr [rdi + " + str(i*32) +
"]\n"
360 S = S +
" vpsubd " + registers[i%len(registers)] +
", " + registers[i%len(registers)] +
", ymmword ptr [rsi + " + str(i*32) +
"]\n"
361 S = S +
" vpsubd " + registers[i%len(registers)] +
", " + registers[i%len(registers)] +
", ymmword ptr [rdx + " + str(i*32) +
"]\n"
# Write the result back with the aligned move form (vmovdqa).
362 S = S +
" vmovdqa ymmword ptr [rdi + " + str(i*32) +
"], " + registers[i%len(registers)] +
"\n"
# Fragment of a generator that appends AVX2 assembly text to S for a routine
# labelled mult_avx2_<plimbs>x<plimbs>; this variant tracks in reg_locked
# which accumulator registers already hold a partial sum.
# NOTE(review): the leading numbers look like original file line numbers and
# they skip values, so the enclosing def and the else/elif lines that choose
# between the emitted alternatives are not visible in this excerpt.
#
# Pool of YMM register names; ymm14 and ymm15 are hard-coded in the emitted
# strings as product temporary and current b operand respectively.
606 registers = [
"ymm0",
"ymm1",
"ymm2",
"ymm3",
"ymm4",
"ymm5",
"ymm6",
"ymm7",
"ymm8",
"ymm9",
"ymm10",
"ymm11",
"ymm12",
"ymm13"]
# One flag per pool register; the literal 14 equals len(registers) here.
607 reg_locked = [0] * 14
# Only a print is visible for the oversize case; whether generation stops
# afterwards cannot be seen from here.
612 if(plimbs > len(registers)):
613 print(
"ERROR: Index out range")
# First plimbs pool names (not referenced again in the visible lines).
615 state = registers[:plimbs]
# Emit the .global directive and the entry label.
623 S = S +
".global mult_avx2_"+ str(plimbs) +
"x" + str(plimbs) +
"\n"
624 S = S +
"mult_avx2_"+ str(plimbs) +
"x" + str(plimbs) +
":\n"
# Preload as many a-vectors as fit below the last plimbs pool entries, which
# are indexed from the end of the pool as accumulators further down.
627 countAs = len(registers) - plimbs
628 for j
in range(0, countAs):
629 S = S +
" vmovdqa " + registers[j] +
", YMMWORD PTR [rsi + " + str(j*32) +
"] \n"
# Outer loop over b: vector j of the buffer at rdx is loaded into ymm15.
633 for j
in range(0,plimbs):
637 S = S +
" vmovdqa ymm15, YMMWORD PTR [rdx + " + str(j*32) +
"] \n"
639 for k
in range(0,plimbs):
# Accumulator already holds a partial sum: multiply into ymm14 (a taken from
# memory at [rsi + k*32] or from registers[k]; the guard between the two is
# not visible) and add it in with vpaddq.
642 if(reg_locked[len(registers)-1 - ((j+k)%plimbs)] == 1):
646 S = S +
" vpmuldq ymm14, ymm15, YMMWORD PTR [rsi + " + str(k*32) +
"]\n"
649 S = S +
" vpmuldq ymm14, " + registers[k] +
", ymm15 \n"
651 S = S +
" vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] +
" \n"
652 reg_locked[len(registers)-1 - ((j+k)%plimbs)] = 1
# NOTE(review): in this string only the row of '#' is an assembly comment;
# " EXTRA LOAD 1" sits on its own line without a '#'. If this text reaches the
# assembler it would be parsed as code - confirm whether this path is live.
654 S = S +
"################\n EXTRA LOAD 1\n"
# Remaining alternatives (guards not visible): the product is written
# directly into the accumulator, which is then flagged as holding a value.
658 S = S +
" vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm15, YMMWORD PTR [rsi + " + str(k*32) +
"]\n"
659 reg_locked[len(registers)-1 - ((j+k)%plimbs)] = 1
662 S = S +
" vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm15, " + registers[k] +
"\n"
663 reg_locked[len(registers)-1 - ((j+k)%plimbs)] = 1
# NOTE(review): same un-prefixed " EXTRA LOAD 2" line as above.
665 S = S +
"################\n EXTRA LOAD 2\n"
671 S = S +
" vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
", ymm15, YMMWORD PTR [rsi + " + str(k*32) +
"]\n"
673 S = S +
" vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +
" , ymm15, " + registers[k] +
"\n"
674 reg_locked[len(registers)-1 - ((j+k)%plimbs)] = 1
# NOTE(review): same un-prefixed " EXTRA LOAD 3" line as above.
676 S = S +
"################\n EXTRA LOAD 3\n"
# Store the accumulator for output vector j at rdi, then clear its flag so
# the next product aimed at this register takes a non-accumulating path
# (the flag is what line 642 tests).
678 S = S +
" vmovdqu YMMWORD PTR [rdi + " + str(j*32) +
"] , " + registers[len(registers)-1 - j] +
"\n"
680 reg_locked[len(registers)-1 - j] = 0
# Epilogue: store registers[len(registers)-1 - j] for j in 0..plimbs-2 to
# output vectors plimbs..2*plimbs-2 at rdi.
684 for j
in range(0,plimbs-1):
685 S = S +
" vmovdqu YMMWORD PTR [rdi + " + str((plimbs + j)*32) +
"], " + registers[len(registers)- j - 1] +
"\n"
# Fragment of a generator that appends x86-64 assembly text to S for a routine
# labelled sub_d_<plimbs+2>x<plimbs>_woc, working on 8-byte words.
# NOTE(review): the leading numbers look like original file line numbers and
# they skip values; the enclosing def, the updates of limbs_counter and
# reg_counter, the else lines and the rotate() helper are not visible here.
#
# General-purpose registers used to hold words of the operand at rdi.
# NOTE(review): rbx and r12-r15 are callee-saved in the System V AMD64 ABI and
# no save/restore is emitted in the visible lines - confirm it happens
# elsewhere.
824 registers = [
"rax",
"rbx",
"rcx",
"r8",
"r9",
"r10",
"r11",
"r12",
"r13",
"r14",
"r15"]
# Emit the .global directive and the entry label.
830 S = S +
".global sub_d_"+ str(plimbs + 2) +
"x" + str(plimbs) +
"_woc\n"
831 S = S +
"sub_d_"+ str(plimbs + 2) +
"x" + str(plimbs) +
"_woc:\n"
# Words still to be loaded; the decrement is not visible in this excerpt.
836 limbs_counter = plimbs
# Load words from [rdi + 8*(plimbs - limbs_counter)] into successive registers
# while words remain and registers are free.
840 while(limbs_counter > 0)
and reg_counter < len(registers):
841 S = S +
" mov " + registers[reg_counter] +
", [rdi + " + str(plimbs - limbs_counter) +
"*8]\n"
# Process the operand in batches; each batch uses at most len(registers)-1
# registers for loaded words (see the loop bound on line 850).
846 for i
in range(0,math.ceil(plimbs/(len(registers)-1))):
849 S = S +
"# ------------------\n"
850 while(limbs_counter > 0)
and reg_counter < len(registers)-1:
851 S = S +
" mov " + registers[reg_counter] +
", [rdi + " + str(plimbs - limbs_counter) +
"*8]\n"
# rotate() is defined outside this excerpt; presumably it shifts the register
# order by one position - confirm against its definition.
854 registers =
rotate(registers, 1)
857 S = S +
"# ------------------\n"
858 for j
in range(0, reg_counter):
# Very first word: plain sub from [rsi], borrow pushed into the next register
# with "sbb ..., 0", then sub from [rdx].
# The offset here uses i*len(registers) while the other branch uses
# i*(len(registers)-1); both are 0 because this branch requires i == 0.
859 if(j == 0)
and (i == 0):
860 S = S +
" sub " + registers[j] +
", [rsi + " + str(j + (i*len(registers))) +
"*8]\n"
861 S = S +
" sbb " + registers[j+1] +
", 0\n"
862 S = S +
" sub " + registers[j] +
", [rdx + " + str(j + (i*len(registers))) +
"*8]\n"
# Every other word (the else line is not visible): sbb from [rsi] consumes the
# pending borrow, the new borrow goes into the next register, then sub [rdx].
864 S = S +
" sbb " + registers[j] +
", [rsi + " + str(j + (i*(len(registers)-1))) +
"*8]\n"
865 S = S +
" sbb " + registers[j+1] +
", 0\n"
866 S = S +
" sub " + registers[j] +
", [rdx + " + str(j + (i*(len(registers)-1))) +
"*8]\n"
867 S = S +
"# ------------------\n"
# Write the batch back to rdi at word offset k + i*(len(registers)-1).
868 for k
in range(0, reg_counter):
869 S = S +
" mov [rdi + " + str(k + (i*(len(registers)-1))) +
"*8], " + registers[k] +
"\n"
# Fold the last borrow into registers[reg_counter] and store that word too.
873 S = S +
" sbb " + registers[reg_counter] +
", 0\n"
874 S = S +
" mov [rdi + " + str(reg_counter + (i*(len(registers)-1))) +
"*8], " + registers[reg_counter] +
"\n"
887 registers = [
"rax",
"rbx",
"rcx",
"r8",
"r9",
"r10",
"r11",
"r12",
"r13",
"r14",
"r15"]
893 S = S +
".global sub_d_"+ str(plimbs + 1) +
"x" + str(plimbs + 1) +
"_woc\n"
894 S = S +
"sub_d_"+ str(plimbs + 1) +
"x" + str(plimbs + 1) +
"_woc:\n"
899 limbs_counter = plimbs
903 while(limbs_counter > 0)
and reg_counter < len(registers):
904 S = S +
" mov " + registers[reg_counter] +
", [rdi + " + str(plimbs - limbs_counter) +
"*8]\n"
909 for i
in range(0,((plimbs-1)//(len(registers)-1))):
913 S = S +
"# ------------------ " + str(i) +
"\n"
914 while(limbs_counter > 0)
and reg_counter < len(registers)-1:
915 S = S +
" mov " + registers[reg_counter] +
", [rdi + " + str(plimbs - limbs_counter) +
"*8]\n"
918 registers =
rotate(registers, 1)
921 S = S +
"# ------------------\n"
922 for j
in range(0, reg_counter):
923 if(j == 0)
and (i == 0):
924 S = S +
" sub " + registers[j] +
", [rsi + " + str(j + (i*len(registers))) +
"*8]\n"
925 S = S +
" sbb " + registers[j+1] +
", 0\n"
926 S = S +
" sub " + registers[j] +
", [rdx + " + str(j + (i*len(registers))) +
"*8]\n"
928 S = S +
" sbb " + registers[j] +
", [rsi + " + str(j + (i*(len(registers)-1))) +
"*8]\n"
929 S = S +
" sbb " + registers[j+1] +
", 0\n"
930 S = S +
" sub " + registers[j] +
", [rdx + " + str(j + (i*(len(registers)-1))) +
"*8]\n"
931 S = S +
"# ------------------\n"
932 for k
in range(0, reg_counter):
933 S = S +
" mov [rdi + " + str(k + (i*(len(registers)-1))) +
"*8], " + registers[k] +
"\n"
935 S = S +
"# last loop\n"
937 i = (plimbs-1)//(len(registers)-1)
941 S = S +
"# ------------------\n"
942 while(limbs_counter > 0)
and reg_counter < len(registers)-1:
943 S = S +
" mov " + registers[reg_counter] +
", [rdi + " + str(plimbs - limbs_counter) +
"*8]\n"
946 registers =
rotate(registers, 1)
949 S = S +
"# ------------------\n"
950 for j
in range(0, reg_counter - 1):
951 if(j == 0)
and (i == 0):
952 S = S +
" sub " + registers[j] +
", [rsi + " + str(j + (i*len(registers))) +
"*8]\n"
953 S = S +
" sbb " + registers[j+1] +
", 0\n"
954 S = S +
" sub " + registers[j] +
", [rdx + " + str(j + (i*len(registers))) +
"*8]\n"
956 S = S +
" sbb " + registers[j] +
", [rsi + " + str(j + (i*(len(registers)-1))) +
"*8]\n"
957 S = S +
" sbb " + registers[j+1] +
", 0\n"
958 S = S +
" sub " + registers[j] +
", [rdx + " + str(j + (i*(len(registers)-1))) +
"*8]\n"
959 S = S +
"# ------------------\n"
960 for k
in range(0, reg_counter):
961 S = S +
" mov [rdi + " + str(k + (i*(len(registers)-1))) +
"*8], " + registers[k] +
"\n"
966 S = S +
" sbb " + registers[reg_counter - 1] +
", 0\n"
967 S = S +
" sub " + registers[reg_counter - 1] +
", [rsi + " + str(reg_counter - 1 + (i*(len(registers)-1))) +
"*8]\n"
968 S = S +
" sbb " + registers[reg_counter] +
", 0\n"
969 S = S +
" sub " + registers[reg_counter] +
", [rsi + " + str(reg_counter + (i*(len(registers)-1))) +
"*8]\n"
970 S = S +
" mov [rdi + " + str(reg_counter - 1 + (i*(len(registers)-1))) +
"*8], " + registers[reg_counter - 1] +
"\n"
971 S = S +
" mov [rdi + " + str(reg_counter + (i*(len(registers)-1))) +
"*8], " + registers[reg_counter] +
"\n"