16 S =
".global " + Var +
"\n" + Var +
":\n"
20 for i
in range(1,pwords+1):
21 if (((i - 1)%4) == 0):
27 S = S +
"0x{:X}".format(PWord[i-1]) +
","
30 S = S +
"0x{:X}".format(PWord[i-1]) +
" \n"
# Assembler preamble: select Intel syntax, open the read-only data section,
# and publish the three size parameters as assembler symbols via `.set`.
S = "".join([
    ".intel_syntax noprefix\n\n",
    ".section .rodata\n\n",
    ".set pbits,", str(pbits), "\n",
    ".set pbytes,", str(pbytes), "\n",
    ".set plimbs,", str(plimbs), "\n\n",
])
# Start a fresh listing: Intel syntax, read-only data section, then the
# exported label uintbig_1 followed by its first row of quadwords (1, 0, 0, 0).
S = (
    ".intel_syntax noprefix\n\n"
    ".section .rodata\n\n"
    ".global uintbig_1\nuintbig_1:\n"
    " .quad 1, 0, 0, 0\n"
)
59 for i
in range(1, N+1):
60 S = S +
" .quad 0, 0, 0, 0\n"
69 S = S +
".section .text\n\n"
71 S = S +
".global uintbig_add\nuintbig_add:\n mov rax, [rsi + 0]\n add rax, [rdx + 0]\n mov [rdi + 0], rax\n .set k, 1\n"
73 S = S +
" .rept " + str(plimbs-1) +
"\n"
75 S = S +
"mov rax, [rsi + 8*k]\n adc rax, [rdx + 8*k]\n mov [rdi + 8*k], rax\n .set k, k+1\n .endr\n setc al\n movzx rax, al\n ret\n\n"
77 S = S +
".global uintbig_sub\nuintbig_sub:\n mov rax, [rsi + 0]\n sub rax, [rdx + 0]\n mov [rdi + 0], rax\n .set k, 1\n"
79 S = S +
" .rept " + str(plimbs-1) +
"\n"
81 S = S +
" mov rax, [rsi + 8*k]\n sbb rax, [rdx + 8*k]\n mov [rdi + 8*k], rax\n .set k, k+1\n .endr\n setc al\n movzx rax, al\n ret"
# Assembly text for the operation counters and two small field helpers.
# The three counters share one template, so they are generated from a tuple.
counter_names = ("fpadd", "fpsqr", "fpmul")

# .data section: one exported, zero-initialised quadword per counter.
chunks = [".section .data\n\n"]
chunks.extend(
    ".global " + name + "\n" + name + ":\n .quad 0\n\n"
    for name in counter_names
)

# .text section header and alignment directive.
chunks.append(".section .text\n\n.p2align 4,,15\n\n")

# init_counters: store 0 into each counter, then return.
chunks.append(
    ".global init_counters\ninit_counters:\n"
    + "".join(" movq [rip + " + name + "], 0\n" for name in counter_names)
    + " ret\n\n"
)

# fp_copy: `rep movsq` with rcx = plimbs.
chunks.append(
    ".global fp_copy\nfp_copy:\n cld\n mov rcx, plimbs\n rep movsq\n ret\n\n"
)

# fp_cswap: rax = -(dl) is used as an AND mask; each limb pair is swapped
# with the xor/and/xor sequence inside a `.rept plimbs` block.
chunks.append(
    ".global fp_cswap\nfp_cswap:\n movzx rax, dl\n neg rax\n .set k, 0\n"
    " .rept plimbs\n mov rcx, [rdi + 8*k]\n"
)
chunks.append(
    "mov rdx, [rsi + 8*k]\n mov r8, rcx\n xor r8, rdx\n and r8, rax\n\n"
    " xor rcx, r8\n xor rdx, r8\n\n"
)
chunks.append(
    " mov [rdi + 8*k], rcx\n mov [rsi + 8*k], rdx\n\n"
    " .set k, k+1\n .endr\n ret\n\n"
)

S = "".join(chunks)
106 Reg_Ar = [
"rdi",
"rsi",
"rdx",
"rcx",
"r8",
"r9",
"r10",
"r11"]
108 S =
".reduce_once:\n push rbp\n mov rbp, rdi\n\n mov rdi, [rbp + 0]\n sub rdi, [rip + p + 0]\n"
112 for i
in range(8,N+1,8):
113 S = S +
" mov " + Reg_Ar[(int(i/8) % 8)] +
", [rbp + " + str(i) +
"]\n sbb " + Reg_Ar[(int(i/8) % 8)] +
", [rip + p + " + str(i) +
"]\n"
114 if ((int((i+8)/8)%8) == 0)
and (i != 0):
117 S = S +
"\n setnc al\n movzx rax, al\n neg rax\n\n"
118 S = S +
".macro cswap2, r, m\n xor \\r, \\m\n and \\r, rax\n xor \\m, \\r\n.endm\n\n\n"
120 n = -(int(pbytes/8) % 8) +8
122 for i
in range(1, n+1):
123 S = S +
" cswap2 " + Reg_Ar[i-1] +
", [rbp + " + str((pbytes-n*8)+(i-1)*8) +
"]\n"
128 for j
in range(1, M+1):
130 S = S +
"\n mov rdi, [rbp + 0]\n sub rdi, [rip + p + 0]\n"
131 for i
in range(8,N+1,8):
132 S = S +
" mov " + Reg_Ar[int(i/8) % 8] +
", [rbp + " + str(i) +
"]\n sbb " + Reg_Ar[int(i/8) % 8] +
", [rip + p + " + str(i) +
"]\n"
133 if ((int((i+8)/8)%8) == 0)
and (i != 0):
136 S = S +
" cswap2 " + Reg_Ar[i-1] +
", [rbp + " + str((N-64)+i*8) +
"]\n"
139 S = S +
" pop rbp\n ret\n\n"
# Assembly text for field addition/subtraction and the Montgomery
# encode/decode entry points, assembled from one ordered list of pieces.
S = "".join([
    # fp_add2 copies rdi into rdx and falls through into fp_add.
    ".global fp_add2\nfp_add2:\n mov rdx, rdi\n\n",
    # fp_add: call uintbig_add, bump the fpadd counter, jump to .reduce_once.
    ".global fp_add\nfp_add:\n push rdi\n call uintbig_add\n pop rdi\n\n"
    " incq [rip + fpadd]\n\n jmp .reduce_once\n\n",
    # fp_sub2 rearranges the arguments and falls through into fp_sub,
    # which starts by calling uintbig_sub.
    ".global fp_sub2\nfp_sub2:\n mov rdx, rdi\n xchg rsi, rdx\n\n"
    ".global fp_sub\nfp_sub:\n push rdi\n call uintbig_sub\n pop rdi\n\n\n",
    # Count the operation, negate rax for use as an AND mask, and reserve
    # pbytes of stack.
    " incq [rip + fpadd] /* increasing number of additions performed */\n\n"
    " neg rax\n\n sub rsp, pbytes\n\n",
    # Write (p AND rax) limb by limb into the stack buffer.
    " mov rcx, [rip + p + 0]\n and rcx, rax\n mov [rsp + 0],rcx\n"
    " .set k, 1\n .rept plimbs-1\n mov rcx, [rip + p + 8*k]\n and rcx, rax\n",
    " mov [rsp + 8*k], rcx\n .set k, k+1\n .endr\n\n",
    # Add the stack buffer into the result with carry propagation (add/adc).
    " mov rcx, [rsp + 0]\n add rcx, [rdi + 0]\n mov [rdi + 0], rcx\n"
    " .set k, 1\n .rept plimbs-1\n mov rcx, [rsp + 8*k]\n",
    " adc rcx, [rdi + 8*k]\n mov [rdi + 8*k], rcx\n .set k, k+1\n .endr\n\n"
    " add rsp, pbytes\n ret\n\n\n",
    # fp_enc / fp_dec: load a constant's address into rdx and jump to fp_mul.
    "/* Montgomery arithmetic */\n\n.global fp_enc\nfp_enc:\n"
    " lea rdx, [rip + r_squared_mod_p]\n jmp fp_mul\n\n",
    ".global fp_dec\nfp_dec:\n lea rdx, [rip + uintbig_1]\n jmp fp_mul\n\n\n",
])
164 Reg_Ar = [
"rbx",
"rcx"]
166 S =
".global fp_mul2\nfp_mul2:\n mov rdx, rdi\n.global fp_mul\nfp_mul:\n push rbp\n push rbx\n\n"
167 S = S +
" incq [rip + fpmul] /* increasing number of multiplications performed */\n\n"
169 S = S +
" sub rsp, " + str(pbytes + 16) +
"\n mov [rsp+ " + str(pbytes+8) +
"],rdi\n mov rdi,rsi\n mov rsi,rdx\n\n\n"
170 S = S +
" xor rax,rax\n"
174 for i
in range(0,N+1,8):
175 S = S +
" mov [rsp+" + str(i) +
"],rax\n"
179 S = S +
"\n\n.macro MULSTEP, k, "
181 for i
in range(0, pwords):
183 R = R +
"I" + str(i) +
","
186 S = S +
"I" + str(pwords) +
"\n\n"
187 S = S +
" mov r11,[rsp+\\I0]\n mov rdx, [rsi + 0]\n mulx rcx, rdx, [rdi + 8*\\k]\n add rdx, r11\n mulx rcx, rdx, [rip + inv_min_p_mod_r]"
188 S = S +
"\n\n xor rax, rax /* clear flags */\n\n\n"
190 S = S +
" mulx rbx, rax, [rip + p + 0]\n adox r11, rax\n mov [rsp+\\I0], r11\n\n"
194 for i
in range(8,N+1,8):
195 S = S +
" mov r11,[rsp+\\I" + str(int(i/8)) +
"]\n mulx " + Reg_Ar[(int(i/8) % 2)] +
", rax, [rip + p + " + str(i) +
"]\n "
196 S = S +
" adcx r11, " + Reg_Ar[-(int(i/8) % 2) +1] +
"\n adox r11, rax\n mov [rsp+\\I" + str(int(i/8)) +
"],r11\n\n"
198 S = S +
"\n mov r11,[rsp+\\I" + str(pwords) +
"]\n mov rax, 0\n adcx r11, rcx\n adox r11, rax\n mov [rsp+\\I" + str(pwords) +
"],r11\n\n"
199 S = S +
" mov rdx, [rdi + 8*\\k]\n\n xor rax, rax /* clear flags */\n\n"
201 S = S +
" mov r11,[rsp+\\I0]\n mulx rbx, rax, [rsi + 0]\n adox r11, rax\n mov [rsp+\\I0],r11\n\n"
203 for i
in range(8,N+1,8):
204 S = S +
" mov r11,[rsp+\\I" + str(int(i/8)) +
"]\n mulx " + Reg_Ar[int(i/8) % 2] +
", rax, [rsi + " + str(i) +
"]\n"
205 S = S +
" adcx r11, " + Reg_Ar[-(int(i/8) % 2) +1] +
"\n adox r11, rax\n mov [rsp+\\I" + str(int(i/8)) +
"],r11\n\n"
207 S = S +
" mov r11,[rsp+\\I" + str(pwords) +
"]\n mov rax, 0\n adcx r11, rcx\n adox r11, rax\n mov [rsp+\\I" + str(pwords) +
"],r11\n\n.endm\n\n"
210 for i
in range(0, pwords):
211 T = T +
"MULSTEP " + str(i) +
","
212 for j
in range(8,pbytes+1,8):
213 T = T + str((j + i*8) % (pbytes + 8)) +
","
216 S = S + str((pbytes +8 + i*8) % (pbytes + 8)) +
"\n"
218 S = S +
"\n\n mov rdi,[rsp+" + str(pbytes+8) +
"]\n\n"
220 for i
in range(0,N+1,8):
221 S = S +
" mov r11,[rsp+" + str(i) +
"]\n mov [rdi+" + str(i) +
"],r11\n"
224 S = S +
" add rsp," + str(pbytes+16) +
"\n\n pop rbx\n pop rbp\n\n jmp .reduce_once\n\n\n"
225 S = S +
".global fp_sq1\nfp_sq1:\n mov rsi, rdi\n.global fp_sqr\nfp_sqr:\n mov rdx, rsi\n\n decq [rip + fpmul]\n incq [rip + fpsqr]\n\n jmp fp_mul\n"
235 S =
".global fp_pow\nfp_pow:\n push rbx\n mov rbx, rsi\n push r12\n push r13\n push rdi\n sub rsp, pbytes\n\n"
236 S = S +
" mov rsi, rdi\n mov rdi, rsp\n call fp_copy\n\n mov rdi, [rsp + pbytes]\n lea rsi, [rip + fp_1]\n call fp_copy\n\n"
237 S = S +
".macro POWSTEP, k\n mov r13, [rbx + 8*\\k]\n xor r12, r12\n\n"
238 S = S +
" 0:\n test r13, 1\n jz 1f\n\n mov rdi, [rsp + pbytes]\n mov rsi, rsp\n call fp_mul2\n\n"
239 S = S +
" 1:\n mov rdi, rsp\n call fp_sq1\n\n shr r13\n\n inc r12\n test r12, 64\n jz 0b\n.endm\n\n"
241 for i
in range(0, pwords):
242 S = S +
" POWSTEP " + str(i) +
"\n"
244 S = S +
" add rsp, pbytes+8\n pop r13\n pop r12\n pop rbx\n ret\n\n\n"
251 S =
".global fp_inv\nfp_inv:\n lea rsi, [rip + p_minus_2]\n jmp fp_pow\n\n\n"
252 S = S +
".global fp_issquare\nfp_issquare:\n push rdi\n lea rsi, [rip + p_minus_1_halves]\n call fp_pow\n pop rdi\n\n"
253 S = S +
" xor rax, rax\n .set k, 0\n .rept plimbs\n mov rsi, [rdi + 8*k]\n xor rsi, [rip + fp_1 + 8*k]\n or rax, rsi\n"
254 S = S +
" .set k, k+1\n .endr\n test rax, rax\n setz al\n movzx rax, al\n ret\n\n\n"
256 S = S +
".global fp_random\nfp_random:\n\n push rdi\n mov rsi, pbytes\n call randombytes\n pop rdi\n"
257 S = S +
" mov rax, 1\n shl rax, (pbits % 64)\n dec rax\n and [rdi + pbytes-8], rax\n\n .set k, plimbs-1\n .rept plimbs\n"
258 S = S +
" mov rax, [rip + p + 8*k]\n cmp [rdi + 8*k], rax\n jge fp_random\n jl 0f\n .set k, k-1\n .endr\n 0:\n ret"
268 RR = IntegerModRing(R)
275 pinv = int(RR(-p).inverse_of_unit())
282 S =
".global fp_0\nfp_0:\n.zero pbytes\n\n"
302 phalvesWords =
WordExtractor((p-1)//2, pwords,
"p_minus_1_halves")
305 pquartersWords =
WordExtractor((p-3)//4, pwords,
"p_minus_3_quarters")
# Bit size of p, i.e. ceil(log2(p)), computed with exact integer arithmetic.
# The previous math.log(p, 2.0) went through floating point, which can
# overshoot an exact or near-exact power of two (math.log(2**29, 2) is
# 29.000000000000004), so ceil() could report one bit too many for a
# multi-thousand-bit prime lying just below a power of two.
# For every integer p >= 1, (p - 1).bit_length() == ceil(log2(p)).
# int() normalises p in case it is not a plain Python int.
pbits = int(p - 1).bit_length()
# Whole bytes needed to hold pbits bits (integer ceiling division).
pbytes = (pbits + 7) // 8
319 pbytes = pbytes + 4 - (pbytes%4)
320 pwords = math.ceil(pbits/64)
349p_2047d221 = 0x5160D4543A2596D320C080B284C0FA5D3600AE4E29B85374858B238036139EA0B8B0C8B2850475382865FD4C9F7C3B5E531ED7D0FC022A13270300584EC78190FD09755A56CFEB1FC6961581CDFC56E824D0F31C4D4ECF04C5243CA0651820AF413023A7310203F74858FBECACA26B375BEBA9DE78CC420A069477B7FE595F83B148223C6841B3592C74AF79F39AE8F3D64F8B9FC946BB1C84A4541CBC2F363029B2C1E296158774A9646D2E186AD699B304FC7311F0DEC85E651756DDB4E3888D02333D591583AE5DB2F656E63A6179CDB059ED9BF90BAD614DCA5628C940C5004D99FB1CB03CE478F65726B12E42FA1C7C8FFFFFFFFFFFFFFFFFFFFFFFFFFF
351p_4095d221 = 0x
352p_5119d256 = 0x7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE89A36D0A7F27637A69FA7A62E0497BC63E3F1C816F32AEAF17F7839D6750A178BB4B691A9922EAB7C9634D0F1491712278F84DA8FC1F472A05D101B8BE9CA1C44282F3AB0B081029CA3967363106F6FCB8542CFEBF9ED83C4B3A35DB0FC42E1FC21CFD1182E49B993A080BC3275B94EA17F8FBB0D7D4BCB3049D6C140A1869043F6460543CD4FEEDEED8D712D6419FE70166C3B8AECA00FACF147447A84B234E2851C58EAE204D626ACE863146FB74EB684C022F3BF3B8BF2CDA4F7A93742D0FB1BCEB40E0E8B01D90C4AFF7ED5B0050F67E9BF0C4A45B73197DE6FF68894CAE2C7B5846A541E178D4FCD073E1282F4488C33A0BFD91BFD26762233E12C84111AEB5BCD7C4C4A5ABFB99C416A83A8637CCF0F4A50FB6B4281CAF7EE5290777B47
353p_6143d306 = 0x
354p_8191d306 = 0x
355p_9215d384 = 0x
# ceil(log2(p)) computed with exact integer arithmetic. The floating-point
# math.log(p, 2.0) can land slightly above an integer for large p near a
# power of two, making ceil() return one too many.
# For every integer p >= 1, (p - 1).bit_length() == ceil(log2(p)).
n = int(p - 1).bit_length()
Print_Parameters(p, pbits, pbytes, pwords)
PrintHeader(pbits, pbytes, plimbs)
WordExtractor(p, pwords, Var)