#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z))
#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i))
#define int32x8_min _mm256_min_epi32
#define int32x8_max _mm256_max_epi32
#define int32x8_MINMAX(a,b) \
do { \
  int32x8 c = int32x8_min(a,b); \
  b = int32x8_max(a,b); \
  a = c; \
} while(0)
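
/*
 * Illustrative usage (not part of the original listing): int32x8_MINMAX is a
 * vertical compare-exchange across eight independent lanes of the int32x8
 * vector type (presumably a typedef of __m256i).  For a hypothetical int32
 * array x of at least 16 elements:
 *
 *   int32x8 lo = int32x8_load(x);       // x[0..7]
 *   int32x8 hi = int32x8_load(x + 8);   // x[8..15]
 *   int32x8_MINMAX(lo,hi);              // per lane i: lo[i] = min, hi[i] = max
 *   int32x8_store(x,lo);
 *   int32x8_store(x + 8,hi);
 */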
static void minmax_vector(int32 *x,int32 *y,long long n)
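  /*
   * minmax_vector presumably performs an elementwise compare-exchange of two
   * length-n arrays, leaving x[i] <= y[i] for each i.  In the fragment below,
   * _mm256_permute2x128_si256 with 0x20/0x31 forms [a.lo|b.lo] and [a.hi|b.hi]
   * from the 128-bit halves of its inputs, and the unpacklo/unpackhi epi32 and
   * epi64 pairs interleave 32-bit and 64-bit elements within each 128-bit
   * half, rearranging lanes (apparently so that compare partners line up).
   */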
  b0 = _mm256_permute2x128_si256(x0,x1,0x20);
  b1 = _mm256_permute2x128_si256(x0,x1,0x31);

  c0 = _mm256_unpacklo_epi64(b0,b1);
  c1 = _mm256_unpackhi_epi64(b0,b1);

  b0 = _mm256_unpacklo_epi32(c0,c1);
  b1 = _mm256_unpackhi_epi32(c0,c1);

  c0 = _mm256_unpacklo_epi64(b0,b1);
  c1 = _mm256_unpackhi_epi64(b0,b1);

  b0 = _mm256_unpacklo_epi32(c0,c1);
  b1 = _mm256_unpackhi_epi32(c0,c1);

  x0 = _mm256_permute2x128_si256(b0,b1,0x20);
  x1 = _mm256_permute2x128_si256(b0,b1,0x31);

  mask = _mm256_set1_epi32(-1);
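
/*
 * int32_twostages_32 appears to perform two consecutive merging stages over x
 * using a fixed 32-element inner block, eight lanes at a time.
 */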
static void int32_twostages_32(int32 *x,long long n)
  for (i = 0;i < 32;i += 8) {
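
/*
 * int32_threestages appears to perform three consecutive merging stages on
 * blocks of 8*q elements; the outer loop stops once a full 8*q block no longer
 * fits, so the return value is presumably the first unprocessed index.
 */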
static long long int32_threestages(int32 *x,long long n,long long q)
  for (k = 0;k + 8*q <= n;k += 8*q)
    for (i = k;i < k + q;i += 8) {
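
/*
 * int32_sort_2power sorts n elements where n is a power of two; flagdown
 * presumably selects descending rather than ascending output, which is used
 * below to build bitonic sequences by sorting two halves in opposite
 * directions.
 */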
static void int32_sort_2power(int32 *x,long long n,int flagdown)
{
  long long p,q,i,j,k;
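  /*
   * The block below appears to be the 16-element base case: the 16 values live
   * in x0 and x1, the unpack/permute ladder applies the fixed lane
   * permutations of a 16-element sorting network around masked compare steps,
   * and merge16_finish apparently completes the final merge and stores the
   * result (inverting x1 first when flagdown asks for descending order).
   */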
  mask = _mm256_set_epi32(0,0,-1,-1,0,0,-1,-1);

  b0 = _mm256_unpacklo_epi32(x0,x1);
  b1 = _mm256_unpackhi_epi32(x0,x1);

  c0 = _mm256_unpacklo_epi64(b0,b1);
  c1 = _mm256_unpackhi_epi64(b0,b1);

  mask = _mm256_set_epi32(0,0,-1,-1,-1,-1,0,0);

  b0 = _mm256_unpacklo_epi32(c0,c1);
  b1 = _mm256_unpackhi_epi32(c0,c1);

  x0 = _mm256_unpacklo_epi64(b0,b1);
  x1 = _mm256_unpackhi_epi64(b0,b1);

  b0 = _mm256_unpacklo_epi32(x0,x1);
  b1 = _mm256_unpackhi_epi32(x0,x1);

  c0 = _mm256_unpacklo_epi64(b0,b1);
  c1 = _mm256_unpackhi_epi64(b0,b1);

  b0 = _mm256_unpacklo_epi32(c0,c1);
  b1 = _mm256_unpackhi_epi32(c0,c1);

  c0 = _mm256_permute2x128_si256(b0,b1,0x20);
  c1 = _mm256_permute2x128_si256(b0,b1,0x31);

  b0 = _mm256_permute2x128_si256(c0,c1,0x20);
  b1 = _mm256_permute2x128_si256(c0,c1,0x31);

  x0 = _mm256_unpacklo_epi64(b0,b1);
  x1 = _mm256_unpackhi_epi64(b0,b1);

  b0 = _mm256_unpacklo_epi32(x0,x1);
  b1 = _mm256_unpackhi_epi32(x0,x1);

  c0 = _mm256_unpacklo_epi64(b0,b1);
  c1 = _mm256_unpackhi_epi64(b0,b1);

  b0 = _mm256_unpacklo_epi32(c0,c1);
  b1 = _mm256_unpackhi_epi32(c0,c1);

  x0 = _mm256_unpacklo_epi64(b0,b1);
  x1 = _mm256_unpackhi_epi64(b0,b1);

  mask = _mm256_set1_epi32(-1);
  if (flagdown) x1 ^= mask;

  merge16_finish(x,x0,x1,flagdown);
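
  /*
   * Presumably the 32-element case: sort the first 16 elements downward and
   * the second 16 upward so that the 32 values form a bitonic sequence, then
   * merge with masked MINMAX steps and finish each half with merge16_finish.
   */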
  int32_sort_2power(x,16,1);
  int32_sort_2power(x + 16,16,0);

  mask = _mm256_set1_epi32(-1);

  merge16_finish(x,x0,x1,flagdown);
  merge16_finish(x + 16,x2,x3,flagdown);
  for (i = 0;i < p;i += 8) {
  mask = _mm256_set1_epi32(-1);

  for (j = 0;j < n;j += 32) {
  int32_threestages(x,n,q >> 2);

  int32_twostages_32(x,n);
  for (k = 0;k < n;k += 8*q)
    for (i = k;i < k + q;i += 8) {
  for (k = 0;k < n;k += 4*q)
    for (i = k;i < k + q;i += 8) {
  for (k = 0;k < n;k += q + q) {
  for (j = 0;j < q;j += p + p) {
    for (k = j;k < j + p + p;k += p) {
      for (i = k;i < k + p;i += 8) {
  if (p<<4 == n) break;
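
  /*
   * Remaining stages at small distances (p = 4, 2, 1): each set_epi32 pattern
   * below selects a fixed group of lanes within a vector, presumably so that
   * the lanes needing the opposite comparison direction can be flipped by XOR
   * before and after a vertical MINMAX.
   */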
  for (p = 4;p >= 1;p >>= 1) {
    mask = _mm256_set_epi32(0,0,0,0,-1,-1,-1,-1);
    while (z != target) {

    mask = _mm256_set_epi32(0,0,-1,-1,-1,-1,0,0);
    while (z != target) {
      int32x8 b0 = _mm256_permute2x128_si256(x0,x1,0x20);
      int32x8 b1 = _mm256_permute2x128_si256(x0,x1,0x31);

      int32x8 c0 = _mm256_permute2x128_si256(b0,b1,0x20);
      int32x8 c1 = _mm256_permute2x128_si256(b0,b1,0x31);

    mask = _mm256_set_epi32(0,-1,-1,0,0,-1,-1,0);
    while (z != target) {
      int32x8 b0 = _mm256_permute2x128_si256(x0,x1,0x20);
      int32x8 b1 = _mm256_permute2x128_si256(x0,x1,0x31);
      int32x8 c0 = _mm256_unpacklo_epi64(b0,b1);
      int32x8 c1 = _mm256_unpackhi_epi64(b0,b1);

      int32x8 d0 = _mm256_unpacklo_epi64(c0,c1);
      int32x8 d1 = _mm256_unpackhi_epi64(c0,c1);

      int32x8 e0 = _mm256_permute2x128_si256(d0,d1,0x20);
      int32x8 e1 = _mm256_permute2x128_si256(d0,d1,0x31);
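
  /*
   * Merging loop: q is apparently the current merge distance, and each pass
   * runs three stages at once via int32_threestages(x,n,q>>2), so q shrinks by
   * a factor of 8 per iteration, with q == 32 handled as one extra pass.
   */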
  while (q >= 128 || q == 32) {
    int32_threestages(x,n,q>>2);
    for (j = 0;j < n;j += 4*q)
      for (k = j;k < j + q;k += 8) {
    for (j = 0;j < n;j += 2*q) {
      for (k = 0;k < q;k += 8) {
  mask = _mm256_set1_epi32(-1);

  for (i = 0;i < n;i += 64) {
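    /*
     * The unpack/permute ladder below is an 8x8 transpose of eight int32x8
     * registers (32-bit interleave, then 64-bit interleave, then a 128-bit
     * half exchange).  It appears twice: a0..a7 -> d0..d7 apparently switches
     * to a transposed layout for the compare steps, and e0..e7 -> g0..g7
     * switches back afterwards.
     */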
    int32x8 b0 = _mm256_unpacklo_epi32(a0,a1);
    int32x8 b1 = _mm256_unpackhi_epi32(a0,a1);
    int32x8 b2 = _mm256_unpacklo_epi32(a2,a3);
    int32x8 b3 = _mm256_unpackhi_epi32(a2,a3);
    int32x8 b4 = _mm256_unpacklo_epi32(a4,a5);
    int32x8 b5 = _mm256_unpackhi_epi32(a4,a5);
    int32x8 b6 = _mm256_unpacklo_epi32(a6,a7);
    int32x8 b7 = _mm256_unpackhi_epi32(a6,a7);

    int32x8 c0 = _mm256_unpacklo_epi64(b0,b2);
    int32x8 c1 = _mm256_unpacklo_epi64(b1,b3);
    int32x8 c2 = _mm256_unpackhi_epi64(b0,b2);
    int32x8 c3 = _mm256_unpackhi_epi64(b1,b3);
    int32x8 c4 = _mm256_unpacklo_epi64(b4,b6);
    int32x8 c5 = _mm256_unpacklo_epi64(b5,b7);
    int32x8 c6 = _mm256_unpackhi_epi64(b4,b6);
    int32x8 c7 = _mm256_unpackhi_epi64(b5,b7);

    int32x8 d0 = _mm256_permute2x128_si256(c0,c4,0x20);
    int32x8 d1 = _mm256_permute2x128_si256(c2,c6,0x20);
    int32x8 d2 = _mm256_permute2x128_si256(c1,c5,0x20);
    int32x8 d3 = _mm256_permute2x128_si256(c3,c7,0x20);
    int32x8 d4 = _mm256_permute2x128_si256(c0,c4,0x31);
    int32x8 d5 = _mm256_permute2x128_si256(c2,c6,0x31);
    int32x8 d6 = _mm256_permute2x128_si256(c1,c5,0x31);
    int32x8 d7 = _mm256_permute2x128_si256(c3,c7,0x31);

    int32x8 e0 = _mm256_unpacklo_epi32(d0,d1);
    int32x8 e1 = _mm256_unpackhi_epi32(d0,d1);
    int32x8 e2 = _mm256_unpacklo_epi32(d2,d3);
    int32x8 e3 = _mm256_unpackhi_epi32(d2,d3);
    int32x8 e4 = _mm256_unpacklo_epi32(d4,d5);
    int32x8 e5 = _mm256_unpackhi_epi32(d4,d5);
    int32x8 e6 = _mm256_unpacklo_epi32(d6,d7);
    int32x8 e7 = _mm256_unpackhi_epi32(d6,d7);

    int32x8 f0 = _mm256_unpacklo_epi64(e0,e2);
    int32x8 f1 = _mm256_unpacklo_epi64(e1,e3);
    int32x8 f2 = _mm256_unpackhi_epi64(e0,e2);
    int32x8 f3 = _mm256_unpackhi_epi64(e1,e3);
    int32x8 f4 = _mm256_unpacklo_epi64(e4,e6);
    int32x8 f5 = _mm256_unpacklo_epi64(e5,e7);
    int32x8 f6 = _mm256_unpackhi_epi64(e4,e6);
    int32x8 f7 = _mm256_unpackhi_epi64(e5,e7);

    int32x8 g0 = _mm256_permute2x128_si256(f0,f4,0x20);
    int32x8 g1 = _mm256_permute2x128_si256(f2,f6,0x20);
    int32x8 g2 = _mm256_permute2x128_si256(f1,f5,0x20);
    int32x8 g3 = _mm256_permute2x128_si256(f3,f7,0x20);
    int32x8 g4 = _mm256_permute2x128_si256(f0,f4,0x31);
    int32x8 g5 = _mm256_permute2x128_si256(f2,f6,0x31);
    int32x8 g6 = _mm256_permute2x128_si256(f1,f5,0x31);
    int32x8 g7 = _mm256_permute2x128_si256(f3,f7,0x31);
  while (q >= 128 || q == 32) {
    for (j = 0;j < n;j += 8*q)
      for (i = j;i < j + q;i += 8) {
    for (j = 0;j < n;j += 4*q)
      for (i = j;i < j + q;i += 8) {
    for (j = 0;j < n;j += q + q) {
      for (i = 0;i < q;i += 8) {
        int32x8 b0 = _mm256_unpacklo_epi32(x0,x4);
        int32x8 b1 = _mm256_unpackhi_epi32(x0,x4);
        int32x8 b2 = _mm256_unpacklo_epi32(x1,x5);
        int32x8 b3 = _mm256_unpackhi_epi32(x1,x5);
        int32x8 b4 = _mm256_unpacklo_epi32(x2,x6);
        int32x8 b5 = _mm256_unpackhi_epi32(x2,x6);
        int32x8 b6 = _mm256_unpacklo_epi32(x3,x7);
        int32x8 b7 = _mm256_unpackhi_epi32(x3,x7);

        int32x8 c0 = _mm256_unpacklo_epi64(b0,b4);
        int32x8 c1 = _mm256_unpacklo_epi64(b1,b5);
        int32x8 c2 = _mm256_unpackhi_epi64(b0,b4);
        int32x8 c3 = _mm256_unpackhi_epi64(b1,b5);
        int32x8 c4 = _mm256_unpacklo_epi64(b2,b6);
        int32x8 c5 = _mm256_unpacklo_epi64(b3,b7);
        int32x8 c6 = _mm256_unpackhi_epi64(b2,b6);
        int32x8 c7 = _mm256_unpackhi_epi64(b3,b7);

        int32x8 d0 = _mm256_permute2x128_si256(c0,c4,0x20);
        int32x8 d1 = _mm256_permute2x128_si256(c1,c5,0x20);
        int32x8 d2 = _mm256_permute2x128_si256(c2,c6,0x20);
        int32x8 d3 = _mm256_permute2x128_si256(c3,c7,0x20);
        int32x8 d4 = _mm256_permute2x128_si256(c0,c4,0x31);
        int32x8 d5 = _mm256_permute2x128_si256(c1,c5,0x31);
        int32x8 d6 = _mm256_permute2x128_si256(c2,c6,0x31);
        int32x8 d7 = _mm256_permute2x128_si256(c3,c7,0x31);
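
  /*
   * Top-level handling of general n: a power-of-two length goes straight to
   * int32_sort_2power; otherwise the code below apparently sorts and merges
   * the array in power-of-two sized pieces, finishing with the 64-, 32- and
   * 16-element tail loops below.
   */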
  if (!(n & (n - 1))) {
    int32_sort_2power(x,n,0);
    return;
  }
  while (q < n - q) q += q;
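  /* The loop above doubles q until q >= n - q, i.e. until 2*q >= n, so q ends
     up as the first doubled value that is at least n/2; e.g. assuming q starts
     at 8 and n = 100, it visits 8, 16, 32, 64 and stops at 64. */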
    for (i = j;i < j + q;i += 8) {
    for (i = j;i < j + q;i += 8) {
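
  /*
   * Tail handling: the remaining j..n range is apparently finished in 64-,
   * then 32-, then 16-element vectorized chunks.
   */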
  for (;j + 64 <= n;j += 64) {
  for (;j + 32 <= n;j += 32) {
  for (;j + 16 <= n;j += 16) {