Let us walk on the 3-isogeny graph
Loading...
Searching...
No Matches
AsmAVX2Codegenerator Namespace Reference

Functions

 rotate (l, x)
 push ()
 pop ()
 PrintSquaringInterleaved (plimbs)
 PrintMultInterleaved (plimbs)
 Print2xAddAVX2 (plimbs)
 PrintSubAVX2 (plimbs)
 PrintInterleave_4x_10x10 ()
 PrintDeInterleave_4x_10x10 ()
 PrintCarryVertical64 ()
 PrintMult (plimbs)
 PrintAdd (plimbs)
 PrintAddOdd (plimbs)
 PrintSubOdd (plimbs)
 PrintSub (plimbs)
 Print_Assembly (plimbs)
 main (argv)

Variables

 sefOfLimbs = set()
int printOut = 0

Function Documentation

◆ main()

main ( argv)

Definition at line 1065 of file AsmAVX2Codegenerator.py.

def main(argv):
    """Print the assembly preamble followed by the selected generated routine.

    argv is expected to hold exactly two items (the program name is not
    included):
      argv[0] -- number of limbs, passed to the generator
      argv[1] -- execution type, stored in the global printOut
                 (1 = only print function names, 0 = print full assembly)

    With any other argument count the usage text is written to stderr and
    the process exits with status 1, so a wrong invocation neither pollutes
    the generated assembly on stdout nor looks like a success to the caller.
    A non-numeric argument raises ValueError from int().
    """
    global printOut
    if len(argv) != 2:
        print(
            "\nplease specify number of limbs and type of execution [1 = only printing functions out, 0 = print full assembly]\n",
            file=sys.stderr)
        sys.exit(1)

    # Parse both arguments before emitting anything, so that a bad value
    # does not leave a half-written preamble on stdout.
    printOut = int(argv[1])
    plimbs = int(argv[0])

    S = ".intel_syntax noprefix\n\n"
    S = S + ".section .rodata\n\n"
    S = S + ".section .text\n\n"

    print(S)

    # Exactly one generator is active; the others are kept as alternatives.
    # PrintMult(plimbs)
    PrintMultInterleaved(plimbs)
    # PrintSquaringInterleaved(plimbs)

    # Print2xAddAVX2(2)
    # PrintSubAVX2(4)
    # PrintDeInterleave_4x_10x10()
    # Print_Assembly(plimbs)
int main(void)
Definition checkct.c:52
end if

References main() and PrintMultInterleaved().

Here is the call graph for this function:

◆ pop()

pop ( )

Definition at line 25 of file AsmAVX2Codegenerator.py.

def pop():
    """Return the assembly text that restores the saved registers.

    The text starts with a separator comment and a "# pop" marker, then one
    "pop" instruction per register in the order r15, r14, r13, r12, rsi,
    rbp, rbx, and ends with an empty line. rdi is not in the list.
    """
    restore_order = ("r15", "r14", "r13", "r12", "rsi", "rbp", "rbx")
    banner = "# -------------------\n" + "# pop\n"
    instructions = "".join(" pop " + reg + "\n" for reg in restore_order)
    return banner + instructions + "\n"
32
33

Referenced by PrintSub(), and PrintSubOdd().

Here is the caller graph for this function:

◆ Print2xAddAVX2()

Print2xAddAVX2 ( plimbs)

Definition at line 327 of file AsmAVX2Codegenerator.py.

def Print2xAddAVX2(plimbs):
    """Print the routine add_2x_<plimbs>x<plimbs>_avx2 and return 0.

    For every index i below plimbs the generated code
      * loads the 32-byte word at [rsi + 32*(plimbs + i)], adds the word at
        [rsi + 32*i] with vpaddd and stores the sum to [rdi + 32*i];
      * does the same with rdx as the source and stores the sum to
        [rdi + 32*(plimbs + i)].
    """
    symbol = "add_2x_" + str(plimbs) + "x" + str(plimbs) + "_avx2"
    upper = plimbs * 32  # byte offset of the second half of each buffer

    chunks = [".global " + symbol + "\n", symbol + ":\n"]
    for limb in range(plimbs):
        low = limb * 32
        # (work register, source pointer, destination offset in rdi)
        for reg, src, dst in (("ymm0", "rsi", low), ("ymm1", "rdx", low + upper)):
            chunks.append(" vmovdqa " + reg + ", ymmword ptr [" + src + " + " + str(upper + low) + "]\n")
            chunks.append(" vpaddd " + reg + ", " + reg + ", ymmword ptr [" + src + " + " + str(low) + "]\n")
            chunks.append(" vmovdqa ymmword ptr [rdi + " + str(dst) + "], " + reg + "\n")
    chunks.append(" ret\n")

    print("".join(chunks))

    return 0
348

◆ Print_Assembly()

Print_Assembly ( plimbs)

Definition at line 981 of file AsmAVX2Codegenerator.py.

981def Print_Assembly(plimbs):
# Walk the limb count recursively and emit everything needed for a
# plimbs x plimbs multiplication.
#   printOut truthy -> only the routine names are printed;
#   printOut falsy  -> the routines are generated through PrintAdd,
#                      PrintAddOdd, PrintSub, PrintSubOdd and PrintMult.
# Sizes above breakOut get add/sub helpers for the current size and a
# recursive call for plimbs//2 and plimbs//2+1; sizes at or below breakOut
# get the base-case multiplication for plimbs and, when still <= breakOut,
# for plimbs+1. The global set sefOfLimbs remembers the sizes that were
# already handled so that each one is emitted only once.
# NOTE(review): this listing has lost its indentation, so the nesting
# described above is inferred from statement order -- confirm against the
# original file.
982 global sefOfLimbs
983 global printOut
# largest size that is treated as a base-case multiplication
984 breakOut = 5
# ---- name-only mode ------------------------------------------------------
985 if(printOut):
986 while 1 :
987 if plimbs > breakOut :
988 print("++++++++++++++++++++++++++")
989 if(plimbs not in sefOfLimbs):
990 sefOfLimbs.add(plimbs)
991 print(str(plimbs) + "x" + str(plimbs))
# even limb count
992 if(plimbs%2!=1):
993 print("add_" + str(plimbs//2) + "x" + str(plimbs//2))
994 print("sub_d_" + str(plimbs + 2) + "x" + str(plimbs) + "_woc")
995 print("add_" + str(plimbs + 2) + "x" + str(plimbs + 1))
996 if(plimbs > breakOut):
997 # print("mult_"+ str(plimbs//2) + "x" + str(plimbs//2))
998 # print("mult_"+ str(plimbs//2+1) + "x" + str(plimbs//2+1))
999 # else:
1000 Print_Assembly(plimbs//2)
1001 Print_Assembly(plimbs//2+1)
# odd limb count
1002 else:
1003 print("add_"+ str(plimbs//2 + 1) + "x" + str(plimbs//2))
1004 print("sub_d_"+ str(plimbs + 1) + "x" + str(plimbs + 1) + "_woc")
1005 print("add_"+ str(plimbs + 2) + "x" + str(plimbs + 1))
1006 if(plimbs > breakOut):
1007 # print("mult_"+ str(plimbs//2) + "x" + str(plimbs//2))
1008 # print("mult_"+ str(plimbs//2+1) + "x" + str(plimbs//2+1))
1009 # else:
1010 Print_Assembly(plimbs//2)
1011 Print_Assembly(plimbs//2+1)
1012 # plimbs = plimbs // 2
1013 else:
1014 plimbs = 0
1015 break
1016 else:
1017 if plimbs <= breakOut:
1018 if(plimbs not in sefOfLimbs):
1019 sefOfLimbs.add(plimbs)
# NOTE(review): the name printed here is "mult_NxN", while PrintMult labels
# its routine "mult_avx2_NxN" -- confirm which spelling is intended.
1020 print("mult_"+ str(plimbs) + "x" + str(plimbs))
1021 if(plimbs+1 not in sefOfLimbs):
1022 if(plimbs+1 <= breakOut):
1023 sefOfLimbs.add(plimbs+1)
1024 print("mult_"+ str(plimbs+1) + "x" + str(plimbs+1))
1025 # PrintMult(plimbs+1)
1026 break
# ---- generation mode: same walk, but the routines are actually emitted ----
1027 else:
1028 while 1 :
1029 if plimbs > breakOut :
1030 if(plimbs not in sefOfLimbs):
1031 sefOfLimbs.add(plimbs)
1032 print("# ------------------" + str(plimbs) + "x" + str(plimbs))
# even limb count
1033 if(plimbs%2!=1):
1034 PrintAdd(plimbs//2)
1035 PrintSubOdd(plimbs)
1036 PrintAddOdd(plimbs+1)
1037 if(plimbs > breakOut):
1038 Print_Assembly(plimbs//2)
1039 Print_Assembly(plimbs//2+1)
# odd limb count
1040 else:
1041 PrintAddOdd(plimbs//2)
1042 PrintSub(plimbs)
1043 PrintAddOdd(plimbs+1)
1044 if(plimbs > breakOut):
1045 Print_Assembly(plimbs//2)
1046 Print_Assembly(plimbs//2+1)
1047 else:
1048 plimbs = 0
1049 break
1050 else:
1051 if plimbs <= breakOut:
1052 if(plimbs not in sefOfLimbs):
1053 sefOfLimbs.add(plimbs)
1054 PrintMult(plimbs)
1055 if(plimbs+1 not in sefOfLimbs):
1056 if(plimbs+1 <= breakOut):
1057 sefOfLimbs.add(plimbs+1)
1058 PrintMult(plimbs+1)
1059 break
1060
1061
1062
1063#//+++++++++++++++ Main ++++++++++++++++++//
1064

References Print_Assembly(), PrintAdd(), PrintAddOdd(), PrintMult(), PrintSub(), and PrintSubOdd().

Referenced by Print_Assembly().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ PrintAdd()

PrintAdd ( plimbs)

Definition at line 694 of file AsmAVX2Codegenerator.py.

def PrintAdd(plimbs):
    """Print the routine add_<plimbs>x<plimbs> and return 0.

    The generated code adds the 64-bit words at [rsi] and [rdx] into [rdi]:
    word 0 with "add", the following plimbs-1 words with "adc" inside an
    assembler .rept loop driven by the symbol k. Finally the word at
    [rdi + plimbs*8] is read, the pending carry is added to it and the
    result is written back to the same place.
    """
    label = "add_" + str(plimbs) + "x" + str(plimbs)
    carry_slot = "[rdi + " + str(plimbs) + "*8]"

    lines = (
        ".global " + label,
        label + ":",
        "# intro",
        " mov rax, [rsi + 0]",
        " add rax, [rdx + 0]",
        " mov [rdi + 0], rax",
        "# loop",
        " .set k, 1",
        " .rept " + str(plimbs - 1),
        " mov rax, [rsi + 8*k]",
        " adc rax, [rdx + 8*k]",
        " mov [rdi + 8*k], rax",
        " .set k, k+1",
        " .endr",
        "# outro",
        " mov rax, " + carry_slot,
        " adc rax, 0",
        " mov " + carry_slot + ", rax",
        " ret",
    )

    print("\n".join(lines) + "\n")
    return 0
722
723#
724# add function for "oddly" combinations like ADD_17x16, ADD_9x8
725#

Referenced by Print_Assembly().

Here is the caller graph for this function:

◆ PrintAddOdd()

PrintAddOdd ( plimbs)

Definition at line 726 of file AsmAVX2Codegenerator.py.

def PrintAddOdd(plimbs):
    """Print the routine add_<plimbs+1>x<plimbs> and return 0.

    The generated code adds the 64-bit words at [rsi] and [rdx] into [rdi]:
    word 0 with "add", the following plimbs-1 words with "adc" inside an
    assembler .rept loop driven by the symbol k. The extra word is then
    taken from [rsi + 8*plimbs], the pending carry is added to it and the
    result is stored to [rdi + 8*plimbs].
    """
    label = "add_" + str(plimbs + 1) + "x" + str(plimbs)
    extra = "8*" + str(plimbs)

    lines = (
        ".global " + label,
        label + ":",
        "# intro",
        " mov rax, [rsi + 0]",
        " add rax, [rdx + 0]",
        " mov [rdi + 0], rax",
        "# loop",
        " .set k, 1",
        " .rept " + str(plimbs - 1),
        " mov rax, [rsi + 8*k]",
        " adc rax, [rdx + 8*k]",
        " mov [rdi + 8*k], rax",
        " .set k, k+1",
        " .endr",
        "# outro",
        " mov rax, [rsi + " + extra + "]",
        " adc rax, 0",
        " mov [rdi + " + extra + "], rax",
        " ret",
    )

    print("\n".join(lines) + "\n")
    return 0
755
756# def PrintSub(plimbs):
757
758# registers = ["rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"]
759
760# S = ""
761# # S = ".intel_syntax noprefix\n\n"
762# # S = S + ".section .rodata\n\n"
763# # S = S + ".section .text\n\n"
764# S = S + ".global sub_d_"+ str(plimbs + 2) + "x" + str(plimbs + 2) + "_woc\n"
765# S = S + "sub_d_"+ str(plimbs + 2) + "x" + str(plimbs + 2) + "_woc:\n"
766
767# S = S + push()
768
769# plimbs += 2
770# limbs_counter = plimbs
771
772# reg_counter = 0
773# S = S + "# intro\n"
774# while(limbs_counter > 0) and reg_counter < len(registers):
775# S = S + " mov " + registers[reg_counter] + ", [rdi + " + str(plimbs - limbs_counter) + "*8]\n"
776# limbs_counter -= 1
777# reg_counter += 1
778# S = S + "# loop\n"
779# for i in range(0,math.ceil(plimbs/len(registers))):
780# if i > 0:
781# reg_counter = 0
782# S = S + "# -------------------\n"
783# while(limbs_counter > 0) and reg_counter < len(registers)-1:
784# S = S + " mov " + registers[reg_counter] + ", [rdi + " + str(plimbs - limbs_counter) + "*8]\n"
785# limbs_counter -= 1
786# reg_counter += 1
787# registers = rotate(registers, 1)
788# else:
789# reg_counter -= 1
790# S = S + "# -------------------\n"
791# for j in range(0, reg_counter):
792# if(j == 0) and (i == 0):
793# S = S + " sub " + registers[j] + ", [rsi + " + str(j + (i*len(registers))) + "*8]\n"
794# S = S + " sbb " + registers[j+1] + ", 0\n"
795# S = S + " sub " + registers[j] + ", [rdx + " + str(j + (i*len(registers))) + "*8]\n"
796# else:
797# S = S + " sbb " + registers[j] + ", [rsi + " + str(j + (i*(len(registers)-1))) + "*8]\n"
798# S = S + " sbb " + registers[j+1] + ", 0\n"
799# S = S + " sub " + registers[j] + ", [rdx + " + str(j + (i*(len(registers)-1))) + "*8]\n"
800# S = S + "# ------------------\n"
801# for k in range(0, reg_counter):
802# S = S + " mov [rdi + " + str(k + (i*(len(registers)-1))) + "*8], " + registers[k] + "\n"
803
804# # outro
805# S = S + "# outro\n"
806# S = S + " mov [rdi + " + str(reg_counter + (i*(len(registers)-1))) + "*8], " + registers[reg_counter] + "\n"
807
808# S = S + pop()
809
810# S = S + " ret\n"
811
812
813
814# print(S)
815
816
817
818# return 0
819
820

Referenced by Print_Assembly().

Here is the caller graph for this function:

◆ PrintCarryVertical64()

PrintCarryVertical64 ( )

Definition at line 510 of file AsmAVX2Codegenerator.py.

def PrintCarryVertical64():
    """Print the routine carryVertical64_avx2 and return 0.

    The generated code walks the twenty 32-byte words at [rdi] .. [rdi + 608].
    For each word it keeps the low 26 bits of every 64-bit lane (mask
    67108863 = 2**26 - 1, broadcast from .LCPI0_0) and adds the bits shifted
    out by "vpsrlq ..., 26" to the next word. The last word only receives the
    incoming carry and is stored unmasked.
    """
    asm = [
        ".global carryVertical64_avx2\n",
        "carryVertical64_avx2:\n",
        # first word: nothing to add yet; the mask is set up here
        " vmovdqa ymm1, ymmword ptr [rdi]\n",
        " vpsrlq ymm2, ymm1, 26\n",
        " vpbroadcastq ymm0, qword ptr [rip + .LCPI0_0] # ymm0 = [67108863,67108863,67108863,67108863]\n",
        " vpand ymm1, ymm1, ymm0\n",
        " vmovdqa ymmword ptr [rdi], ymm1\n",
    ]

    # words at offsets 32 .. 544: add incoming carry, split off the next one
    for offset in range(32, 576, 32):
        word = "ymmword ptr [rdi + " + str(offset) + "]"
        asm.append(" vpaddq ymm1, ymm2, " + word + "\n")
        asm.append(" vpsrlq ymm2, ymm1, 26\n")
        asm.append(" vpand ymm1, ymm1, ymm0\n")
        asm.append(" vmovdqa " + word + ", ymm1\n")

    # word at 576 reuses ymm0 for the masked value; word at 608 just takes
    # the final carry
    asm.extend((
        " vpaddq ymm1, ymm2, ymmword ptr [rdi + 576]\n",
        " vpsrlq ymm2, ymm1, 26\n",
        " vpand ymm0, ymm1, ymm0\n",
        " vmovdqa ymmword ptr [rdi + 576], ymm0\n",
        " vpaddq ymm0, ymm2, ymmword ptr [rdi + 608]\n",
        " vmovdqa ymmword ptr [rdi + 608], ymm0\n",
        " ret\n",
    ))
    print("".join(asm))

    return 0
600

◆ PrintDeInterleave_4x_10x10()

PrintDeInterleave_4x_10x10 ( )

Definition at line 408 of file AsmAVX2Codegenerator.py.

def PrintDeInterleave_4x_10x10():
    """Print the fixed routine deinterleave_4x_10x10_avx2 and return 0.

    The routine text is a constant instruction sequence: it reads 32-byte
    words from [rdi] .. [rdi + 608], combines pairs of them with
    vpsllq/vpxor and writes the rearranged words to the buffers addressed by
    rsi, rdx, rcx and r8 (offsets 0, 32, 64 and 96 in each).
    """
    # One entry per emitted line; trailing blanks inside the literals are
    # part of the emitted text and are kept on purpose.
    listing = (
        ".global deinterleave_4x_10x10_avx2",
        "deinterleave_4x_10x10_avx2:",
        " vmovdqa ymm0, ymmword ptr [rdi + 32]",
        " vpsllq ymm0, ymm0, 32",
        " vpxor ymm0, ymm0, ymmword ptr [rdi]",
        " vmovdqa ymm1, ymmword ptr [rdi + 96]",
        " vpsllq ymm1, ymm1, 32",
        " vpxor ymm1, ymm1, ymmword ptr [rdi + 64]",
        " vmovdqa ymm2, ymmword ptr [rdi + 160]",
        " vpsllq ymm2, ymm2, 32",
        " vpxor ymm2, ymm2, ymmword ptr [rdi + 128]",
        " vmovdqa ymm3, ymmword ptr [rdi + 224]",
        " vpsllq ymm3, ymm3, 32",
        " vpxor ymm3, ymm3, ymmword ptr [rdi + 192]",
        " vinserti128 ymm4, ymm0, xmm1, 1",
        " vmovdqa ymmword ptr [rsi], ymm4",
        " vperm2i128 ymm0, ymm0, ymm1, 49 ",
        " vmovdqa ymmword ptr [rdx], ymm0",
        " vinserti128 ymm0, ymm2, xmm3, 1",
        " vmovdqa ymmword ptr [rcx], ymm0",
        " vperm2i128 ymm0, ymm2, ymm3, 49",
        " vmovdqa ymmword ptr [r8], ymm0",
        " vpermpd ymm1, ymmword ptr [rsi], 216",
        " vpermq ymm2, ymmword ptr [rdx], 216",
        " vpermpd ymm3, ymmword ptr [rcx], 216",
        " vpermq ymm0, ymm0, 216 ",
        " vinsertf128 ymm4, ymm1, xmm3, 1",
        " vmovaps ymmword ptr [rsi], ymm4",
        " vperm2f128 ymm1, ymm1, ymm3, 49 ",
        " vmovaps ymmword ptr [rdx], ymm1",
        " vinserti128 ymm1, ymm2, xmm0, 1",
        " vmovdqa ymmword ptr [rcx], ymm1",
        " vperm2i128 ymm0, ymm2, ymm0, 49 ",
        " vmovdqa ymmword ptr [r8], ymm0",
        " vmovdqa ymm0, ymmword ptr [rdi + 288]",
        " vpsllq ymm0, ymm0, 32",
        " vpxor ymm0, ymm0, ymmword ptr [rdi + 256]",
        " vmovq xmm1, xmm0 ",
        " vmovdqa ymmword ptr [rsi + 32], ymm1",
        " vpsrldq xmm1, xmm0, 8 ",
        " vmovdqa ymmword ptr [rdx + 32], ymm1",
        " vextracti128 xmm1, ymm0, 1",
        " vmovq xmm1, xmm1 ",
        " vmovdqa ymmword ptr [rcx + 32], ymm1",
        " vpermq ymm0, ymm0, 255",
        " vmovq xmm0, xmm0",
        " vmovdqa ymmword ptr [r8 + 32], ymm0",
        " vmovdqa ymm0, ymmword ptr [rdi + 352]",
        " vmovdqa ymm1, ymmword ptr [rdi + 416]",
        " vmovdqa ymm2, ymmword ptr [rdi + 480]",
        " vpsllq ymm0, ymm0, 32",
        " vpxor ymm0, ymm0, ymmword ptr [rdi + 320]",
        " vmovdqa ymm3, ymmword ptr [rdi + 544]",
        " vpsllq ymm1, ymm1, 32",
        " vpxor ymm1, ymm1, ymmword ptr [rdi + 384]",
        " vpsllq ymm2, ymm2, 32",
        " vpxor ymm2, ymm2, ymmword ptr [rdi + 448]",
        " vpsllq ymm3, ymm3, 32",
        " vpxor ymm3, ymm3, ymmword ptr [rdi + 512]",
        " vinserti128 ymm4, ymm0, xmm1, 1",
        " vmovdqa ymmword ptr [rsi + 64], ymm4",
        " vperm2i128 ymm0, ymm0, ymm1, 49 # ymm0 = ymm0[2,3],ymm1[2,3]",
        " vmovdqa ymmword ptr [rdx + 64], ymm0",
        " vinserti128 ymm0, ymm2, xmm3, 1",
        " vmovdqa ymmword ptr [rcx + 64], ymm0",
        " vperm2i128 ymm0, ymm2, ymm3, 49 # ymm0 = ymm2[2,3],ymm3[2,3]",
        " vmovdqa ymmword ptr [r8 + 64], ymm0",
        " vpermpd ymm1, ymmword ptr [rsi + 64], 216 # ymm1 = mem[0,2,1,3]",
        " vpermq ymm2, ymmword ptr [rdx + 64], 216 # ymm2 = mem[0,2,1,3]",
        " vpermpd ymm3, ymmword ptr [rcx + 64], 216 # ymm3 = mem[0,2,1,3]",
        " vpermq ymm0, ymm0, 216",
        " vinsertf128 ymm4, ymm1, xmm3, 1",
        " vmovaps ymmword ptr [rsi + 64], ymm4",
        " vperm2f128 ymm1, ymm1, ymm3, 49 # ymm1 = ymm1[2,3],ymm3[2,3]",
        " vmovaps ymmword ptr [rdx + 64], ymm1",
        " vinserti128 ymm1, ymm2, xmm0, 1",
        " vmovdqa ymmword ptr [rcx + 64], ymm1",
        " vperm2i128 ymm0, ymm2, ymm0, 49 # ymm0 = ymm2[2,3],ymm0[2,3]",
        " vmovdqa ymmword ptr [r8 + 64], ymm0",
        " vmovdqa ymm0, ymmword ptr [rdi + 608]",
        " vpsllq ymm0, ymm0, 32",
        " vpxor ymm0, ymm0, ymmword ptr [rdi + 576]",
        " vmovq xmm1, xmm0 ",
        " vmovdqa ymmword ptr [rsi + 96], ymm1",
        " vpsrldq xmm1, xmm0, 8",
        " vmovdqa ymmword ptr [rdx + 96], ymm1",
        " vextracti128 xmm1, ymm0, 1",
        " vmovq xmm1, xmm1",
        " vmovdqa ymmword ptr [rcx + 96], ymm1",
        " vpermq ymm0, ymm0, 255",
        " vmovq xmm0, xmm0",
        " vmovdqa ymmword ptr [r8 + 96], ymm0",
        " ret",
    )
    print("".join(entry + "\n" for entry in listing))

    return 0
509

◆ PrintInterleave_4x_10x10()

PrintInterleave_4x_10x10 ( )

Definition at line 370 of file AsmAVX2Codegenerator.py.

def PrintInterleave_4x_10x10():
    """Print the fixed routine interleave_4x_10x10_avx2 and return 0.

    The routine text is a constant instruction sequence: it reads 32-byte
    words from the buffers addressed by rsi, rdx, rcx and r8 (offsets 0 and
    32), rearranges them with vperm2i128/vpermq and stores five words to
    [rdi] .. [rdi+128].
    """
    listing = (
        ".global interleave_4x_10x10_avx2",
        "interleave_4x_10x10_avx2:",
        " vmovdqa ymm5, YMMWORD PTR [rdx]",
        " vmovdqa ymm7, YMMWORD PTR [r8]",
        " vperm2i128 ymm3, ymm5, YMMWORD PTR [rsi], 2",
        " vperm2i128 ymm2, ymm7, YMMWORD PTR [rcx], 2",
        " vperm2i128 ymm1, ymm5, YMMWORD PTR [rsi], 19",
        " vperm2i128 ymm0, ymm7, YMMWORD PTR [rcx], 19",
        " vpermq ymm3, ymm3, 216",
        " vpermq ymm2, ymm2, 216",
        " vpermq ymm1, ymm1, 216",
        " vperm2i128 ymm4, ymm2, ymm3, 2",
        " vpermq ymm0, ymm0, 216",
        " vperm2i128 ymm2, ymm2, ymm3, 19",
        " vmovdqa YMMWORD PTR [rdi], ymm4",
        " vmovdqa YMMWORD PTR [rdi+32], ymm2",
        " vperm2i128 ymm2, ymm0, ymm1, 2",
        " vperm2i128 ymm0, ymm0, ymm1, 19",
        " vmovdqa YMMWORD PTR [rdi+64], ymm2",
        " vmovdqa YMMWORD PTR [rdi+96], ymm0",
        " vmovdqa ymm1, YMMWORD PTR [rdx+32]",
        " vmovdqa ymm0, YMMWORD PTR [r8+32]",
        " vperm2i128 ymm1, ymm1, YMMWORD PTR [rsi+32], 2",
        " vperm2i128 ymm0, ymm0, YMMWORD PTR [rcx+32], 2",
        " vpermq ymm1, ymm1, 216",
        " vpermq ymm0, ymm0, 216",
        " vperm2i128 ymm0, ymm0, ymm1, 2",
        " vmovdqa YMMWORD PTR [rdi+128], ymm0",
        " ret",
    )
    print("".join(entry + "\n" for entry in listing))

    return 0
406
407

◆ PrintMult()

PrintMult ( plimbs)

Definition at line 601 of file AsmAVX2Codegenerator.py.

601def PrintMult(plimbs):
# Build and print the Intel-syntax routine "mult_avx2_<plimbs>x<plimbs>".
#
# The generated code reads operand a as 32-byte words from [rsi + 32*k] and
# operand b from [rdx + 32*j], multiplies every pair with vpmuldq and
# accumulates the products with vpaddq. Result word i is stored to
# [rdi + 32*i] for i = 0 .. 2*plimbs-2.
#
# Register use in the generated code: ymm0..ymm13 are a shared pool
# (a-words are loaded from the low end, accumulators are taken from the
# high end), ymm14 is the scratch product and ymm15 holds the current
# b-word.
#
# plimbs -- number of limbs; when it exceeds the pool size (14) an error is
#           printed and exit() is called.
# Returns 0; the assembly text goes to stdout through print().
# NOTE(review): indentation is not preserved in this listing, so the
# branch nesting below has to be checked against the original file.
602
603
604 # registers reserved rdi, rsi, rdx
605 # rax, rbx = rcx, r8
606 registers = ["ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13"]
# reg_locked[i] == 1 marks registers[i] as currently holding live data
607 reg_locked = [0] * 14
608
609 # ymm14 = temp register t0
610 # ymm15 = current b
611
612 if(plimbs > len(registers)):
613 print("ERROR: Index out range")
614 exit()
# NOTE(review): state is assigned but never read in this function.
615 state = registers[:plimbs]
616 #state = registers
617
618 S = ""
619 # S = ".intel_syntax noprefix\n\n"
620 # S = S + ".section .rodata\n\n"
621 # S = S + ".section .text\n\n"
622
623 S = S + ".global mult_avx2_"+ str(plimbs) + "x" + str(plimbs) + "\n"
624 S = S + "mult_avx2_"+ str(plimbs) + "x" + str(plimbs) + ":\n"
625
626 # load as much a's as possible
# a-words with index >= countAs are not preloaded; they are used as memory
# operands [rsi + 32*k] in the multiplications below
627 countAs = len(registers) - plimbs
628 for j in range(0, countAs):
629 S = S + " vmovdqa " + registers[j] + ", YMMWORD PTR [rsi + " + str(j*32) + "] \n"
630 reg_locked[j] = 1
631
632 # mult bi x a0...ai
633 for j in range(0,plimbs):
634 # print(j)
635 # print(reg_locked)
636 # S = S + " vmovdqu " + str(j*32) + "(%rdx), % \n"
637 S = S + " vmovdqa ymm15, YMMWORD PTR [rdx + " + str(j*32) + "] \n"
638 # mults
# the product a_k * b_j goes to the accumulator registers[13 - ((j+k) % plimbs)]
639 for k in range(0,plimbs):
640 # S = S + " vpmuldq %ymm15, %" + registers[k] + ", %" + registers[len(registers)-1 - ((j+k)%plimbs)] + " \n"
641 if(j>0):
# accumulator already holds a partial sum: multiply into ymm14, then add
642 if(reg_locked[len(registers)-1 - ((j+k)%plimbs)] == 1):
643 if(k >= countAs):
644 # read from stack
645 # S = S + " vpmuldq " + str(k*32) + "(%rsi), %ymm15, %ymm14 \n"
646 S = S + " vpmuldq ymm14, ymm15, YMMWORD PTR [rsi + " + str(k*32) + "]\n"
647 else:
648 #S = S + " vpmuldq %ymm15, %" + registers[k] + ", %ymm14 \n"
649 S = S + " vpmuldq ymm14, " + registers[k] + ", ymm15 \n"
650 if(0 in reg_locked):
651 S = S + " vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] + " \n"
652 reg_locked[len(registers)-1 - ((j+k)%plimbs)] = 1
653 else:
# NOTE(review): when no register is free only this marker text is emitted
# (here and for "EXTRA LOAD 2"/"EXTRA LOAD 3" below); the second marker line
# is not an assembler comment -- presumably a deliberate flag for a
# configuration the generator does not handle; confirm.
654 S = S + "################\n EXTRA LOAD 1\n"
655 else:
# accumulator is free: the product is written into it directly
656 if(k >= countAs):
657 # read from stack
658 S = S + " vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm15, YMMWORD PTR [rsi + " + str(k*32) + "]\n"
659 reg_locked[len(registers)-1 - ((j+k)%plimbs)] = 1
660 else:
661 if(0 in reg_locked):
662 S = S + " vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm15, " + registers[k] + "\n"
663 reg_locked[len(registers)-1 - ((j+k)%plimbs)] = 1
664 else:
665 S = S + "################\n EXTRA LOAD 2\n"
666
667 else:
# first b-word (j == 0): every product initialises its accumulator
668 if(0 in reg_locked):
669 if(k >= countAs):
670 # read from stack
671 S = S + " vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm15, YMMWORD PTR [rsi + " + str(k*32) + "]\n"
672 else:
673 S = S + " vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] + " , ymm15, " + registers[k] + "\n"
674 reg_locked[len(registers)-1 - ((j+k)%plimbs)] = 1
675 else:
676 S = S + "################\n EXTRA LOAD 3\n"
677
# store the accumulator for result word j, then mark it as free so that a
# later product overwrites it instead of adding to it
678 S = S + " vmovdqu YMMWORD PTR [rdi + " + str(j*32) + "] , " + registers[len(registers)-1 - j] + "\n"
679 # S = S + " vpxor %" + registers[len(registers)-1 - j] + ", %" + registers[len(registers)-1 - j] + " , %" + registers[len(registers)-1 - j] + " \n"
680 reg_locked[len(registers)-1 - j] = 0
681 # S = S + " vpxor %" + registers[len(registers)-1 - j] + ", %" + registers[len(registers)-1 - j] + " , %" + registers[len(registers)-1 - j] + " \n"
682
683 # mult bi x a0...ai
# this last loop stores the remaining plimbs-1 accumulators as result
# words plimbs .. 2*plimbs-2
684 for j in range(0,plimbs-1):
685 S = S + " vmovdqu YMMWORD PTR [rdi + " + str((plimbs + j)*32) + "], " + registers[len(registers)- j - 1] + "\n"
686
687 S = S + " ret\n"
688
689 print(S)
690
691
692 return 0
693

References if.

Referenced by Print_Assembly().

Here is the caller graph for this function:

◆ PrintMultInterleaved()

PrintMultInterleaved ( plimbs)

Definition at line 181 of file AsmAVX2Codegenerator.py.

181def PrintMultInterleaved(plimbs):
# Build and print the Intel-syntax routine "mult_avx2_<plimbs>x<plimbs>"
# for operands that keep two limbs in every 32-byte word.
#
# Limbs 2i and 2i+1 share word i: an even limb is loaded from
# [rsi + 32*(k//2)] (a) or [rdx + 32*(j//2)] (b); the following odd limb is
# reached by applying "vpshufd reg, reg, 177" to the same register.
# Every product a_k * b_j is formed with vpmuldq and, for j > 0, added with
# vpaddq to the accumulator chosen from the high end of the pool by
# (j+k) % plimbs. Result word i is stored to [rdi + 32*i] for
# i = 0 .. 2*plimbs-2; after each store inside the main loop the accumulator
# is cleared with vxorps.
#
# Register use in the generated code: ymm0..ymm12 pool, ymm13 scratch for
# a-words that are not kept in the pool, ymm14 scratch product, ymm15
# current b-word.
#
# plimbs -- number of limbs; when it exceeds the pool size (13) an error is
#           printed and exit() is called.
# Returns 0; the assembly text goes to stdout through print().
# Uses math.ceil, so the math module must be imported by the file.
# NOTE(review): the emitted symbol name is identical to the one produced by
# PrintMult ("mult_avx2_NxN") -- confirm both are never emitted together.
# NOTE(review): indentation is not preserved in this listing, so the
# branch nesting below has to be checked against the original file.
182
183 # registers reserved rdi, rsi, rdx
184 # rax, rbx = rcx, r8
185 registers = ["ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12"]
# NOTE(review): reg_locked is written below but never read; count and state
# are assigned and never used in this function.
186 reg_locked = [0] * len(registers)
# perm records whether ymm15 currently holds the shuffled (odd) b-limb
187 perm = False
188 count = 0
189
190 # ymm13 = temp shuffle
191 # ymm14 = temp register t0
192 # ymm15 = current b
193
194 if(plimbs > len(registers)):
195 print("ERROR: Index out range")
196 exit()
197 state = registers[:plimbs]
198 #state = registers
199
200 S = ""
201 # S = ".intel_syntax noprefix\n\n"
202 # S = S + ".section .rodata\n\n"
203 # S = S + ".section .text\n\n"
204
205 S = S + ".global mult_avx2_"+ str(plimbs) + "x" + str(plimbs) + "\n"
206 S = S + "mult_avx2_"+ str(plimbs) + "x" + str(plimbs) + ":\n"
207
208 # load as much a's as possible
# countAs counts preloaded a-WORDS; each word carries two limbs, so limbs
# with k < countAs*2 are served from registers[k//2] further down
209 countAs = min(len(registers) - plimbs, math.ceil(plimbs/2))
210 S = S + "############## load as much a's as possible\n"
211 for j in range(0, countAs):
212 S = S + " vmovdqa " + registers[j] + ", YMMWORD PTR [rsi + " + str(j*32) + "] \n"
213 reg_locked[j] = 1
214
215 for j in range(0,plimbs):
# odd j: switch ymm15 to the other limb of the word that is already loaded;
# even j: load the next b-word
216 if(j%2==1):
217 S = S + "\n\n############## perm to b_" + str(j) + "\n"
218 S = S + " vpshufd ymm15, ymm15, 177\n"
219 perm = True
220 else:
221 S = S + "\n\n############## load b_" + str(j) + "\n"
222 S = S + " vmovdqa ymm15, YMMWORD PTR [rdx + " + str(j//2*32) + "] \n"
223 perm = False
224 if(j==0): # without adds = only for b0
225 S = S + "############## w/o adds\n"
226
227 for k in range(0,plimbs):
228 S = S + "############## a_" + str(k) + " * b_" + str(j) + "\n"
229 if(k<countAs*2):
230 S = S + "############## read from regs\n"
231 if(k%2==1):
232 # S = S + "############## perm to a_" + str(k) + "\n"
233 S = S + "############## perm " + registers[k//2] + " to a_" + str(k) + "\n"
234 S = S + " vpshufd " + registers[k//2] + ", " + registers[k//2] + ", 177\n"
235 else:
236 # S = S + "############## load a_" + str(k) + "\n"
237 S = S + "############## " + registers[k//2] + " = a_" + str(k) + "\n"
238
239 S = S + " vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +" , ymm15, " + registers[k//2] + "\n"
240 else:
# a-word is not in the pool: bring it in through the scratch register ymm13
241 S = S + "############## read from stack\n"
242 if(k%2==1):
243 S = S + "############## perm to a_" + str(k) + "\n"
244 S = S + " vpshufd ymm13, ymm13, 177\n"
245 else:
246 S = S + " vmovdqa ymm13, YMMWORD PTR [rsi + " + str((k//2)*32) + "]\n"
247 S = S + "############## load a_" + str(k) + "\n"
248 S = S + " vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +", ymm15, ymm13\n"
# result word 0 is complete: store it and clear the accumulator for reuse
249 S = S + " vmovdqu YMMWORD PTR [rdi + " + str(j*32) + "], " + registers[len(registers)-1 - j] + "\n"
250 S = S + " vxorps " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + "\n"
251 # S = S + " vpxor " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + "\n"
252 else: # with adds
253 S = S + "############## perm = " + str(perm) + "\n"
254 S = S + "############## w/ adds\n"
255 for k in range(0,plimbs):
256 if(k<countAs*2):
# with perm set the emitted labels and the accumulator index use k-1 for odd
# k and k+1 for even k instead of k
257 if(perm):
258 if(k%2==1):
259 S = S + "############## a_" + str(k-1) + " * b_" + str(j) + "\n"
260 else:
261 S = S + "############## a_" + str(k+1) + " * b_" + str(j) + "\n"
262 else:
263 S = S + "############## a_" + str(k) + " * b_" + str(j) + "\n"
264 S = S + "############## read from regs\n"
265 if(perm):
266 if(k%2==1):
267 S = S + "############## perm " + registers[k//2] + " to a_" + str(k-1) + "\n"
268 S = S + " vpshufd " + registers[k//2] + ", " + registers[k//2] + ", 177\n"
269 if(k==plimbs-2):
270 S = S + "############## save add\n"
271
272 S = S + " vpmuldq ymm14, ymm15, " + registers[k//2] + "\n"
273 S = S + " vpaddq " + registers[len(registers)-1 - ((j+k-1)%plimbs)] + ", ymm14, " + registers[len(registers)-1 - ((j+k-1)%plimbs)] + " \n"
274 else:
275 if(k==plimbs-2):
276 S = S + "############## save add\n"
277 S = S + "############## " + registers[k//2] + " = a_" + str(k+1) + "\n"
278 S = S + " vpmuldq ymm14, ymm15, " + registers[k//2] + "\n"
279 S = S + " vpaddq " + registers[len(registers)-1 - ((j+k+1)%plimbs)] + ", ymm14, " + registers[len(registers)-1 - ((j+k+1)%plimbs)] + " \n"
280 else:
281 if(k%2==1):
282 if(k==plimbs-2):
283 S = S + "############## save add\n"
284 S = S + "############## perm " + registers[k//2] + " to a_" + str(k) + "\n"
285 S = S + " vpshufd " + registers[k//2] + ", " + registers[k//2] + ", 177\n"
286 S = S + " vpmuldq ymm14, ymm15, " + registers[k//2] + "\n"
287 S = S + " vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] + " \n"
288 else:
289 S = S + "############## " + registers[k//2] + " = a_" + str(k) + "\n"
290 S = S + " vpmuldq ymm14, ymm15, " + registers[k//2] + "\n"
291 S = S + " vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] + " \n"
292
293
294 else:
295 S = S + "############## a_" + str(k) + " * b_" + str(j) + "\n"
296 S = S + "############## read from stack\n"
297 if(k%2==1):
298 S = S + "############## perm to a_" + str(k) + "\n"
299 S = S + " vpshufd ymm13, ymm13, 177\n"
300 else:
301
302 S = S + "############## load a_" + str(k) + "\n"
303 S = S + " vmovdqa ymm13, YMMWORD PTR [rsi + " + str((k//2)*32) + "]\n"
# for the last limb the product is written straight into the accumulator
# (no vpaddq); every other limb goes through ymm14 and is added
304 if(k==plimbs-1):
305 S = S + "############## save add\n"
306 S = S + " vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm15, ymm13\n"
307 else:
308 S = S + " vpmuldq ymm14, ymm15, ymm13\n"
309 S = S + " vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] +"\n"
310
311
# result word j is complete: store it and clear the accumulator for reuse
312 S = S + " vmovdqu YMMWORD PTR [rdi + " + str(j*32) + "] , " + registers[len(registers)-1 - j] + "\n"
313 S = S + " vxorps " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + "\n"
314 # S = S + " vpxor " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + "\n"
315
# the remaining plimbs-1 accumulators become result words plimbs .. 2*plimbs-2
316 S = S + "############## write all the rest\n"
317 for j in range(0,plimbs-1):
318 S = S + " vmovdqu YMMWORD PTR [rdi + " + str((plimbs + j)*32) + "], " + registers[len(registers)- j - 1] + "\n"
319
320 S = S + " ret\n"
321
322 print(S)
323
324 return 0
325
326

References if.

Referenced by main().

Here is the caller graph for this function:

◆ PrintSquaringInterleaved()

PrintSquaringInterleaved ( plimbs)

Definition at line 34 of file AsmAVX2Codegenerator.py.

34def PrintSquaringInterleaved(plimbs):
 # Build one Intel-syntax AVX2 routine as a string, print it, and return 0.
 #
 # plimbs: number of limbs; must not exceed len(registers) (13), otherwise
 #         "ERROR: Index out range" is printed and exit() is called.
 #
 # The generated code reads one operand from [rsi] ("a"), the other from
 # [rdx] ("b"), and writes 2*plimbs - 1 32-byte words to [rdi].
 # NOTE(review): despite the function name, the emitted symbol is
 # mult_avx2_<plimbs>x<plimbs> and both rsi and rdx are read -- this looks
 # like a copy of the multiplication generator; confirm the intended symbol.
35
36 # registers reserved rdi, rsi, rdx
37 # rax, rbx = rcx, r8
38 registers = ["ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12"]
 # NOTE(review): reg_locked, count and state are assigned in this function
 # but never read afterwards.
39 reg_locked = [0] * len(registers)
40 perm = False
41 count = 0
42
43 # ymm13 = temp shuffle
44 # ymm14 = temp register t0
45 # ymm15 = current b
46
47 if(plimbs > len(registers)):
48 print("ERROR: Index out range")
49 exit()
50 state = registers[:plimbs]
51 #state = registers
52
53 S = ""
54 # S = ".intel_syntax noprefix\n\n"
55 # S = S + ".section .rodata\n\n"
56 # S = S + ".section .text\n\n"
57
58 S = S + ".global mult_avx2_"+ str(plimbs) + "x" + str(plimbs) + "\n"
59 S = S + "mult_avx2_"+ str(plimbs) + "x" + str(plimbs) + ":\n"
60
61 # load as much a's as possible
 # countAs low-numbered ymm registers are preloaded from [rsi]; the count is
 # capped both by the registers left over next to the plimbs result
 # registers and by ceil(plimbs/2) (one 32-byte word serves two limbs).
62 countAs = min(len(registers) - plimbs, math.ceil(plimbs/2))
63 S = S + "############## load as much a's as possible\n"
64 for j in range(0, countAs):
65 S = S + " vmovdqa " + registers[j] + ", YMMWORD PTR [rsi + " + str(j*32) + "] \n"
66 reg_locked[j] = 1
67
 # Outer loop over the limbs of b: an even j loads a fresh word into ymm15,
 # an odd j permutes ymm15 in place with vpshufd (immediate 177 = 0xB1).
 # perm records which of the two happened for the "with adds" branch below.
68 for j in range(0,plimbs):
69 if(j%2==1):
70 S = S + "\n\n############## perm to b_" + str(j) + "\n"
71 S = S + " vpshufd ymm15, ymm15, 177\n"
72 perm = True
73 else:
74 S = S + "\n\n############## load b_" + str(j) + "\n"
75 S = S + " vmovdqa ymm15, YMMWORD PTR [rdx + " + str(j//2*32) + "] \n"
76 perm = False
77 if(j==0): # without adds = only for b0
78 S = S + "############## w/o adds\n"
 # First row: every product is written straight into its result register
 # registers[len(registers)-1 - ((j+k)%plimbs)] with vpmuldq, no vpaddq.
79 for k in range(0,plimbs):
80 if(k==j):
81 S = S + "############## EQUAL\n"
82 S = S + "############## a_" + str(k) + " * b_" + str(j) + "\n"
83 if(k<countAs*2):
84 S = S + "############## read from regs\n"
85 if(k%2==1):
86 # S = S + "############## perm to a_" + str(k) + "\n"
87 S = S + "############## perm " + registers[k//2] + " to a_" + str(k) + "\n"
88 S = S + " vpshufd " + registers[k//2] + ", " + registers[k//2] + ", 177\n"
89 else:
90 # S = S + "############## load a_" + str(k) + "\n"
91 S = S + "############## " + registers[k//2] + " = a_" + str(k) + "\n"
92
93 S = S + " vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +" , ymm15, " + registers[k//2] + "\n"
94 else:
95 S = S + "############## read from stack\n"
96 if(k%2==1):
97 S = S + "############## perm to a_" + str(k) + "\n"
98 S = S + " vpshufd ymm13, ymm13, 177\n"
99 else:
100 S = S + " vmovdqa ymm13, YMMWORD PTR [rsi + " + str((k//2)*32) + "]\n"
101 S = S + "############## load a_" + str(k) + "\n"
102 S = S + " vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] +", ymm15, ymm13\n"
 # Store result word j to [rdi + j*32] and clear its register with vxorps.
103 S = S + " vmovdqu YMMWORD PTR [rdi + " + str(j*32) + "], " + registers[len(registers)-1 - j] + "\n"
104 S = S + " vxorps " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + "\n"
105 # S = S + " vpxor " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + "\n"
106 else: # with adds
107 S = S + "############## perm = " + str(perm) + "\n"
108 S = S + "############## w/ adds\n"
 # Later rows: products go through ymm14 and are added into the result
 # register with vpaddq. When perm is set, the preloaded a-registers are
 # addressed with k-1 / k+1 instead of k in both labels and result index.
109 for k in range(0,plimbs):
110 if(k<countAs*2):
111 if(perm):
112 if(k%2==1):
113 S = S + "############## a_" + str(k-1) + " * b_" + str(j) + "\n"
114 else:
115 S = S + "############## a_" + str(k+1) + " * b_" + str(j) + "\n"
116 else:
117 S = S + "############## a_" + str(k) + " * b_" + str(j) + "\n"
118 S = S + "############## read from regs\n"
119 if(perm):
120 if(k%2==1):
121 S = S + "############## perm " + registers[k//2] + " to a_" + str(k-1) + "\n"
122 S = S + " vpshufd " + registers[k//2] + ", " + registers[k//2] + ", 177\n"
123 if(k==plimbs-2):
124 S = S + "############## save add\n"
125
126 S = S + " vpmuldq ymm14, ymm15, " + registers[k//2] + "\n"
127 S = S + " vpaddq " + registers[len(registers)-1 - ((j+k-1)%plimbs)] + ", ymm14, " + registers[len(registers)-1 - ((j+k-1)%plimbs)] + " \n"
128 else:
129 if(k==plimbs-2):
130 S = S + "############## save add\n"
131 S = S + "############## " + registers[k//2] + " = a_" + str(k+1) + "\n"
132 S = S + " vpmuldq ymm14, ymm15, " + registers[k//2] + "\n"
133 S = S + " vpaddq " + registers[len(registers)-1 - ((j+k+1)%plimbs)] + ", ymm14, " + registers[len(registers)-1 - ((j+k+1)%plimbs)] + " \n"
134 else:
135 if(k%2==1):
136 if(k==plimbs-2):
137 S = S + "############## save add\n"
138 S = S + "############## perm " + registers[k//2] + " to a_" + str(k) + "\n"
139 S = S + " vpshufd " + registers[k//2] + ", " + registers[k//2] + ", 177\n"
140 S = S + " vpmuldq ymm14, ymm15, " + registers[k//2] + "\n"
141 S = S + " vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] + " \n"
142 else:
143 S = S + "############## " + registers[k//2] + " = a_" + str(k) + "\n"
144 S = S + " vpmuldq ymm14, ymm15, " + registers[k//2] + "\n"
145 S = S + " vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] + " \n"
146
147
148 else:
149 S = S + "############## a_" + str(k) + " * b_" + str(j) + "\n"
150 S = S + "############## read from stack\n"
151 if(k%2==1):
152 S = S + "############## perm to a_" + str(k) + "\n"
153 S = S + " vpshufd ymm13, ymm13, 177\n"
154 else:
155
156 S = S + "############## load a_" + str(k) + "\n"
157 S = S + " vmovdqa ymm13, YMMWORD PTR [rsi + " + str((k//2)*32) + "]\n"
 # For the last limb the product is written directly into the result
 # register (vpmuldq only), skipping the ymm14 + vpaddq step.
158 if(k==plimbs-1):
159 S = S + "############## save add\n"
160 S = S + " vpmuldq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm15, ymm13\n"
161 else:
162 S = S + " vpmuldq ymm14, ymm15, ymm13\n"
163 S = S + " vpaddq " + registers[len(registers)-1 - ((j+k)%plimbs)] + ", ymm14, " + registers[len(registers)-1 - ((j+k)%plimbs)] +"\n"
164
165
 # Store result word j to [rdi + j*32] and clear its register with vxorps.
166 S = S + " vmovdqu YMMWORD PTR [rdi + " + str(j*32) + "] , " + registers[len(registers)-1 - j] + "\n"
167 S = S + " vxorps " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + "\n"
168 # S = S + " vpxor " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + ", " + registers[len(registers)-1 - j] + "\n"
169
 # The remaining plimbs-1 registers are stored to [rdi + (plimbs + j)*32].
170 S = S + "############## write all the rest\n"
171 for j in range(0,plimbs-1):
172 S = S + " vmovdqu YMMWORD PTR [rdi + " + str((plimbs + j)*32) + "], " + registers[len(registers)- j - 1] + "\n"
173
174 S = S + " ret\n"
175
176 print(S)
177
178 return 0
179
180

References if.

◆ PrintSub()

PrintSub ( plimbs)

Definition at line 885 of file AsmAVX2Codegenerator.py.

885def PrintSub(plimbs):
 # Build the x86-64 routine sub_d_<plimbs+1>x<plimbs+1>_woc as a string,
 # print it, and return 0.
 #
 # The generated code loads 64-bit words from [rdi] into general-purpose
 # registers, applies sub/sbb against the words at [rsi] and [rdx], and
 # stores the registers back to [rdi]. It is wrapped in the text returned by
 # the module-level push() and pop() helpers; rotate() is used to shift the
 # register list between chunks.
886
887 registers = ["rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"]
888
889 S = ""
890 # S = ".intel_syntax noprefix\n\n"
891 # S = S + ".section .rodata\n\n"
892 # S = S + ".section .text\n\n"
893 S = S + ".global sub_d_"+ str(plimbs + 1) + "x" + str(plimbs + 1) + "_woc\n"
894 S = S + "sub_d_"+ str(plimbs + 1) + "x" + str(plimbs + 1) + "_woc:\n"
895
896 S = S + push()
897
 # From here on plimbs is one larger than the argument, matching the
 # <plimbs+1> used in the symbol name above.
898 plimbs += 1
899 limbs_counter = plimbs
900
 # Intro: fill up to len(registers) registers with the leading words of [rdi].
901 reg_counter = 0
902 S = S + "# intro\n"
903 while(limbs_counter > 0) and reg_counter < len(registers):
904 S = S + " mov " + registers[reg_counter] + ", [rdi + " + str(plimbs - limbs_counter) + "*8]\n"
905 limbs_counter -= 1
906 reg_counter += 1
907
 # Full chunks: each iteration handles len(registers)-1 words; for i > 0 the
 # next words are loaded first, for i == 0 the intro's registers are reused
 # with reg_counter reduced by one.
908 S = S + "# loop\n"
909 for i in range(0,((plimbs-1)//(len(registers)-1))):
910 # for i in range(0,math.ceil(plimbs/(len(registers)-1))-1):
911 if i > 0:
912 reg_counter = 0
913 S = S + "# ------------------ " + str(i) + "\n"
914 while(limbs_counter > 0) and reg_counter < len(registers)-1:
915 S = S + " mov " + registers[reg_counter] + ", [rdi + " + str(plimbs - limbs_counter) + "*8]\n"
916 limbs_counter -= 1
917 reg_counter += 1
918 registers = rotate(registers, 1)
919 else:
920 reg_counter -= 1
921 S = S + "# ------------------\n"
922 for j in range(0, reg_counter):
 # Both offset formulas below evaluate to j when i == 0, so the first
 # branch differs only in starting with "sub" instead of "sbb".
923 if(j == 0) and (i == 0):
924 S = S + " sub " + registers[j] + ", [rsi + " + str(j + (i*len(registers))) + "*8]\n"
925 S = S + " sbb " + registers[j+1] + ", 0\n"
926 S = S + " sub " + registers[j] + ", [rdx + " + str(j + (i*len(registers))) + "*8]\n"
927 else:
928 S = S + " sbb " + registers[j] + ", [rsi + " + str(j + (i*(len(registers)-1))) + "*8]\n"
929 S = S + " sbb " + registers[j+1] + ", 0\n"
930 S = S + " sub " + registers[j] + ", [rdx + " + str(j + (i*(len(registers)-1))) + "*8]\n"
931 S = S + "# ------------------\n"
932 for k in range(0, reg_counter):
933 S = S + " mov [rdi + " + str(k + (i*(len(registers)-1))) + "*8], " + registers[k] + "\n"
934
 # Final chunk: same shape as one loop iteration, but i is set explicitly to
 # the first index the loop above did not cover, and the sub/sbb loop stops
 # one register earlier (reg_counter - 1); the outro handles the rest.
935 S = S + "# last loop\n"
 # if(1) is always true; it only groups the statements of the last chunk.
936 if(1):
937 i = (plimbs-1)//(len(registers)-1)
938 # i = math.ceil(plimbs/(len(registers)-1)) -1
939 if i > 0:
940 reg_counter = 0
941 S = S + "# ------------------\n"
942 while(limbs_counter > 0) and reg_counter < len(registers)-1:
943 S = S + " mov " + registers[reg_counter] + ", [rdi + " + str(plimbs - limbs_counter) + "*8]\n"
944 limbs_counter -= 1
945 reg_counter += 1
946 registers = rotate(registers, 1)
947 else:
948 reg_counter -= 1
949 S = S + "# ------------------\n"
950 for j in range(0, reg_counter - 1):
951 if(j == 0) and (i == 0):
952 S = S + " sub " + registers[j] + ", [rsi + " + str(j + (i*len(registers))) + "*8]\n"
953 S = S + " sbb " + registers[j+1] + ", 0\n"
954 S = S + " sub " + registers[j] + ", [rdx + " + str(j + (i*len(registers))) + "*8]\n"
955 else:
956 S = S + " sbb " + registers[j] + ", [rsi + " + str(j + (i*(len(registers)-1))) + "*8]\n"
957 S = S + " sbb " + registers[j+1] + ", 0\n"
958 S = S + " sub " + registers[j] + ", [rdx + " + str(j + (i*(len(registers)-1))) + "*8]\n"
959 S = S + "# ------------------\n"
960 for k in range(0, reg_counter):
961 S = S + " mov [rdi + " + str(k + (i*(len(registers)-1))) + "*8], " + registers[k] + "\n"
962
963
964 # outro
 # NOTE(review): the outro subtracts only words from [rsi] (never [rdx]) for
 # the last two registers -- confirm this is the intended operand.
965 S = S + "# outro\n"
966 S = S + " sbb " + registers[reg_counter - 1] + ", 0\n"
967 S = S + " sub " + registers[reg_counter - 1] + ", [rsi + " + str(reg_counter - 1 + (i*(len(registers)-1))) + "*8]\n"
968 S = S + " sbb " + registers[reg_counter] + ", 0\n"
969 S = S + " sub " + registers[reg_counter] + ", [rsi + " + str(reg_counter + (i*(len(registers)-1))) + "*8]\n"
970 S = S + " mov [rdi + " + str(reg_counter - 1 + (i*(len(registers)-1))) + "*8], " + registers[reg_counter - 1] + "\n"
971 S = S + " mov [rdi + " + str(reg_counter + (i*(len(registers)-1))) + "*8], " + registers[reg_counter] + "\n"
972
973 S = S + pop()
974
975 S = S + " ret\n"
976
977 print(S)
978 return 0
979
980#////////////////////////////////////////

References if, pop(), push(), and rotate().

Referenced by Print_Assembly().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ PrintSubAVX2()

PrintSubAVX2 ( plimbs)

Definition at line 349 of file AsmAVX2Codegenerator.py.

def PrintSubAVX2(plimbs):
    """Print the AVX2 routine ``sub_<plimbs>x<plimbs>_avx2`` and return 0.

    For each of the ``plimbs`` 32-byte words the generated code loads the
    word at ``[rdi + 32*i]``, subtracts (``vpsubd``) the words at
    ``[rsi + 32*i]`` and ``[rdx + 32*i]`` from it, and stores the result
    back to ``[rdi + 32*i]``. Working registers cycle through
    ``ymm0``..``ymm13``.
    """
    ymm_pool = ["ymm{}".format(n) for n in range(14)]
    symbol = "sub_{0}x{0}_avx2".format(plimbs)

    # add_2x_2x2_avx2
    asm = [".global " + symbol, symbol + ":"]
    for word in range(plimbs):
        reg = ymm_pool[word % len(ymm_pool)]
        offset = str(word * 32)
        asm.append(" vmovdqa {}, ymmword ptr [rdi + {}]".format(reg, offset))
        asm.append(" vpsubd {0}, {0}, ymmword ptr [rsi + {1}]".format(reg, offset))
        asm.append(" vpsubd {0}, {0}, ymmword ptr [rdx + {1}]".format(reg, offset))
        asm.append(" vmovdqa ymmword ptr [rdi + {}], {}".format(offset, reg))
    asm.append(" ret")

    # Every emitted line ends in a newline; print() then adds one more.
    print("\n".join(asm) + "\n")

    return 0
369

◆ PrintSubOdd()

PrintSubOdd ( plimbs)

Definition at line 821 of file AsmAVX2Codegenerator.py.

821def PrintSubOdd(plimbs):
 # Build the x86-64 routine sub_d_<plimbs+2>x<plimbs>_woc as a string, print
 # it, and return 0.
 #
 # The generated code loads 64-bit words from [rdi] into general-purpose
 # registers, applies sub/sbb against the words at [rsi] and [rdx], and
 # stores the registers back to [rdi]. It is wrapped in the text returned by
 # the module-level push() and pop() helpers; rotate() is used to shift the
 # register list between chunks.
822
823
824 registers = ["rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"]
825
826 S = ""
827 # S = ".intel_syntax noprefix\n\n"
828 # S = S + ".section .rodata\n\n"
829 # S = S + ".section .text\n\n"
830 S = S + ".global sub_d_"+ str(plimbs + 2) + "x" + str(plimbs) + "_woc\n"
831 S = S + "sub_d_"+ str(plimbs + 2) + "x" + str(plimbs) + "_woc:\n"
832
833 S = S + push()
834
 # From here on plimbs is one larger than the argument.
835 plimbs += 1
836 limbs_counter = plimbs
837
 # Intro: fill up to len(registers) registers with the leading words of [rdi].
838 reg_counter = 0
839 S = S + "# intro\n"
840 while(limbs_counter > 0) and reg_counter < len(registers):
841 S = S + " mov " + registers[reg_counter] + ", [rdi + " + str(plimbs - limbs_counter) + "*8]\n"
842 limbs_counter -= 1
843 reg_counter += 1
844
 # Chunks of len(registers)-1 words; for i > 0 the next words are loaded
 # first, for i == 0 the intro's registers are reused with reg_counter
 # reduced by one.
845 S = S + "# loop\n"
846 for i in range(0,math.ceil(plimbs/(len(registers)-1))):
847 if i > 0:
848 reg_counter = 0
849 S = S + "# ------------------\n"
850 while(limbs_counter > 0) and reg_counter < len(registers)-1:
851 S = S + " mov " + registers[reg_counter] + ", [rdi + " + str(plimbs - limbs_counter) + "*8]\n"
852 limbs_counter -= 1
853 reg_counter += 1
854 registers = rotate(registers, 1)
855 else:
856 reg_counter -= 1
857 S = S + "# ------------------\n"
858 for j in range(0, reg_counter):
 # Both offset formulas below evaluate to j when i == 0, so the first
 # branch differs only in starting with "sub" instead of "sbb".
859 if(j == 0) and (i == 0):
860 S = S + " sub " + registers[j] + ", [rsi + " + str(j + (i*len(registers))) + "*8]\n"
861 S = S + " sbb " + registers[j+1] + ", 0\n"
862 S = S + " sub " + registers[j] + ", [rdx + " + str(j + (i*len(registers))) + "*8]\n"
863 else:
864 S = S + " sbb " + registers[j] + ", [rsi + " + str(j + (i*(len(registers)-1))) + "*8]\n"
865 S = S + " sbb " + registers[j+1] + ", 0\n"
866 S = S + " sub " + registers[j] + ", [rdx + " + str(j + (i*(len(registers)-1))) + "*8]\n"
867 S = S + "# ------------------\n"
868 for k in range(0, reg_counter):
869 S = S + " mov [rdi + " + str(k + (i*(len(registers)-1))) + "*8], " + registers[k] + "\n"
870
871 # outro
 # The outro reuses i and reg_counter as left by the final loop iteration.
872 S = S + "# outro\n"
873 S = S + " sbb " + registers[reg_counter] + ", 0\n"
874 S = S + " mov [rdi + " + str(reg_counter + (i*(len(registers)-1))) + "*8], " + registers[reg_counter] + "\n"
875
876 S = S + pop()
877
878 S = S + " ret\n"
879
880
881
882 print(S)
883 return 0
884

References if, pop(), push(), and rotate().

Referenced by Print_Assembly().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ push()

push ( )

Definition at line 17 of file AsmAVX2Codegenerator.py.

def push():
    """Return the assembly text that saves the working registers.

    The text is a two-line comment banner followed by one ``push`` per
    register, in the order rbx, rbp, rsi, r12, r13, r14, r15, and a
    trailing blank line. ``pop()`` emits the matching pops in reverse order.
    """
    saved_registers = ("rbx", "rbp", "rsi", "r12", "r13", "r14", "r15")
    banner = "# -------------------\n" + "# push\n"
    pushes = "\n".join(" push " + reg for reg in saved_registers)
    return banner + pushes + "\n\n"
24

Referenced by PrintSub(), and PrintSubOdd().

Here is the caller graph for this function:

◆ rotate()

rotate ( l,
x )

Definition at line 14 of file AsmAVX2Codegenerator.py.

def rotate(l, x):
    """Return a new sequence equal to *l* rotated right by *x* positions.

    ``rotate([a, b, c, d], 1)`` gives ``[d, a, b, c]``; a negative *x*
    rotates to the left. *x* is reduced modulo ``len(l)``, so a shift larger
    than the sequence wraps around instead of silently returning the input
    unrotated (plain slicing clamps out-of-range bounds). An empty sequence
    is returned as an empty copy.
    """
    if not l:
        # Nothing to rotate, and len(l) == 0 would make the modulo fail.
        return l[:]
    x %= len(l)
    return l[-x:] + l[:-x]
16

Referenced by PrintSub(), and PrintSubOdd().

Here is the caller graph for this function:

Variable Documentation

◆ printOut

int printOut = 0

Definition at line 12 of file AsmAVX2Codegenerator.py.

◆ sefOfLimbs

sefOfLimbs = set()

Definition at line 11 of file AsmAVX2Codegenerator.py.