110 void xMUL_dac(proj *Q, proj const *A24, int Aaffine, proj const *P, int64_t dac, int64_t daclen, int64_t maxdaclen)
115 xDBL(&P2, &P1, A24, Aaffine);
117 xADD(&P3, &P2, &P1, &P1);
119 int64_t collision = fp_iszero(Pinput.z);
123 int64_t want = 1 + int64mask_negative(daclen);
124 proj_cmov(Q, &P3, want);
132 proj_cswap(&P1, &P2, 1 - (dac & 1));
137 collision |= want & fp_iszero(P2.z);
140 xADD(&next, &P3, &P1, &P2);
149 proj_cmov(Q, &Pinput, collision);
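The lines above track a collision flag alongside the main computation: it is set when the input point has Z = 0 (line 119) and, while want is still 1 (line 123 keeps it at 1 until daclen goes negative, judging by int64mask_negative), when the intermediate P2 degenerates to Z = 0 (line 137); the proj_cmov at line 149 then falls back to the saved input point Pinput in constant time instead of branching on these conditions.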
160 const proj Pcopy = *P;
173 int64_t bitxor = bit ^ prevbit;
175 proj_cswap(Q, &R, bitxor);
177 xDBLADD(Q, &R, Q, &R, &Pcopy, &A24, Aaffine);
181 proj_cswap(Q, &R, prevbit);
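Lines 160-181 are the Montgomery-ladder portion: the swaps are driven by the 0/1 flags bitxor and prevbit through proj_cswap, and the curve arithmetic goes through xDBLADD, so no secret-dependent branch is taken. Below is a minimal, self-contained sketch of the branch-free word swap that such helpers are conventionally built on; the helper cswap64 and its exact form are illustrative, not the library's own code.

#include <assert.h>
#include <stdint.h>

/* Illustrative sketch only (not the library's proj_cswap): swap two 64-bit
   words exactly when flag is 1, without a branch.  flag must be 0 or 1. */
static void cswap64(uint64_t *a, uint64_t *b, uint64_t flag)
{
    uint64_t mask = (uint64_t)0 - flag;  /* 0 -> all-zero mask, 1 -> all-one mask */
    uint64_t t = mask & (*a ^ *b);
    *a ^= t;
    *b ^= t;
}

int main(void)
{
    uint64_t a = 1, b = 2;
    cswap64(&a, &b, 0);
    assert(a == 1 && b == 2);  /* flag 0: unchanged */
    cswap64(&a, &b, 1);
    assert(a == 2 && b == 1);  /* flag 1: swapped */
    return 0;
}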
435 int64_t sqrtvelu = 0;
439 steps(&bs, &gs, klower);
448 fp tmp0, tmp1, tmp2, tmp3, tmp4;
455 fp_double2(&Aed.z, (const fp *)&A->z);
458 fp_add3(&Aed.x, (const fp *)&A->x, (const fp *)&Aed.z);
464 fp_double2(&A24.z, (const fp *)&Aed.z);
467 fp_sub3(&Aed.z, (const fp *)&A->x, (const fp *)&Aed.z);
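Lines 455-467 derive the twisted-Edwards coefficient pair from the projective Montgomery coefficient (A.x : A.z): Aed.z temporarily holds 2*A.z, Aed.x becomes A.x + 2*A.z, line 464 doubles that intermediate into A24.z (giving 4*A.z, consumed by the xDBL calls below), and Aed.z is finally overwritten with A.x - 2*A.z. The pair (A.x + 2*A.z : A.x - 2*A.z) is the Edwards form that the codomain computation at lines 778-785 works with.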
473 for (int64_t s = 0; s < kupper; ++s)
479 xDBL(&M[2], K, &A24, 0);
486 for (int64_t s = 3; s < kupper; ++s)
498 xADD(&M[s], &M[2], &M[1], &M[1]);
507 xADD(&M[s], &M[s - 2], &M[2], &M[s - 4]);
516 int64_t i = s / 2 - 1;
518 if (i < (kupper - 1) / 2 - 2 * bs * gs)
523 xDBL(&M[s], &M[2], &A24, 0);
532 xADD(&M[s], &M[s - 2], &M[2], &M[s - 4]);
547 xADD(&M[s], &M[bs + 1], &M[bs - 1], &M[2]);
553 else if (s == 4 * bs)
556 xDBL(&M[s], &M[2 * bs], &A24, 0);
562 else if (s == 6 * bs)
566 xADD(&M[s], &M[4 * bs], &M[2 * bs], &M[2 * bs]);
572 else if (s % (4 * bs) == 2 * bs)
574 int64_t j = s / (4 * bs);
575 assert(s == 2 * bs * (2 * j + 1));
578 assert(Minit[s - 4 * bs]);
579 assert(Minit[s - 8 * bs]);
581 xADD(&M[s], &M[s - 4 * bs], &M[4 * bs], &M[s - 8 * bs]);
593 for (int64_t i = 3; i <= (kupper - 1) / 2; ++i)
598 xADD(&M[i], &M[i - 1], K, &M[i - 2]);
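Lines 473-598 build the needed kernel multiples M[i] = [i]K using only x-coordinate differential additions: M[2] comes from doubling K (line 479), and every xADD adds two already-initialized entries whose difference is also already in the table, e.g. M[i] = M[i-1] + K with difference M[i-2] at line 598, or M[s] = M[s-2] + M[2] with difference M[s-4] at lines 507 and 532; the Minit[] assertions in the sqrt-Vélu cases check exactly this initialization invariant.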
611 proj Qbatch[fixPlen];
612 for (int64_t h = 0; h < Plen; ++h)
617 fp Psum[fixPlen], Pdif[fixPlen];
618 for (int64_t h = 0; h < Plen; ++h)
622 fp_add3(&Psum[h], (const fp *)&P[h].x, (const fp *)&P[h].z);
624 fp_sub3(&Pdif[h], (const fp *)&P[h].x, (const fp *)&P[h].z);
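Lines 611-624 set up one accumulator pair Qbatch[h] per point being pushed through the isogeny and cache Psum[h] = P[h].x + P[h].z and Pdif[h] = P[h].x - P[h].z, so that the evaluation loops below can form their cross products against each kernel multiple without recomputing these sums and differences.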
632 for (int64_t i = 0; i < bs; ++i)
635 fp_neg2(&TI[2 * i], (const fp *)&M[2 * i + 1].x);
645 for (int64_t j = 0; j < gs; ++j)
647 assert(Minit[2 * bs * (2 * j + 1)]);
648 biquad_precompute_curve(Aprecomp[j], &M[2 * bs * (2 * j + 1)], A);
649 biquad_postcompute_curve(T1 + 3 * j, Tminus1 + 3 * j, (const fp *)Aprecomp[j]);
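Lines 632-649 are specific to the sqrt-Vélu path: TI collects the negated x-coordinates of the odd baby-step multiples M[2i+1], and for each giant-step index j the biquad_precompute_curve / biquad_postcompute_curve calls appear to prepare coefficient data (Aprecomp, T1, Tminus1) for the biquadratic relations between x-coordinates that the sqrt-Vélu evaluation relies on; the exact contents of these arrays are defined in library code outside this excerpt.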
655 fp precomp[precompsize];
662 for (int64_t i = 1; i < bs; ++i)
663 fp_mul2(&Abatch.x, (const fp *)&v[i]);
667 for (int64_t i = 1; i < bs; ++i)
668 fp_mul2(&Abatch.z, (const fp *)&v[i]);
670 for (int64_t h = 0; h < Plen; ++h)
673 biquad_precompute_point(Pprecomp, &P[h]);
676 for (int64_t j = 0; j < gs; ++j)
677 biquad_postcompute_point(TP + 3 * j, (const fp *)Pprecomp, (const fp *)Aprecomp[j]);
680 fp TPinv[2 * gs + 1];
681 for (int64_t j = 0; j < 2 * gs + 1; ++j)
686 for (int64_t i = 1; i < bs; ++i)
687 fp_mul2(&Qbatch[h].z, (const fp *)&v[i]);
691 for (int64_t i = 1; i < bs; ++i)
692 fp_mul2(&Qbatch[h].x, (const fp *)&v[i]);
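Lines 655-692 fold the field elements v[1], ..., v[bs-1] (produced by code elided between lines 655 and 662) into the running products: into Abatch.x and Abatch.z, which later become the codomain coefficient, and into Qbatch[h].x and Qbatch[h].z for every pushed point; only fixed-length fp_mul2 accumulation loops appear here, while the polynomial arithmetic that defines v[] is outside this excerpt.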
695 int64_t ignore = (k - 1) / 2 - 2 * bs * gs;
696 for (int64_t i = 0; i < (kupper - 1) / 2 - 2 * bs * gs; ++i)
698 int64_t want = -((i - ignore) >> 61);
700 fp_sub3(&tmp4, (const fp *)&M[2 * i + 2].x, (const fp *)&M[2 * i + 2].z);
701 fp_add3(&tmp3, (const fp *)&M[2 * i + 2].x, (const fp *)&M[2 * i + 2].z);
702 fp_mul3(&tmp2, (const fp *)&Abatch.x, (const fp *)&tmp4);
703 fp_cmov_ctidh(&Abatch.x, (const fp *)&tmp2, want);
704 fp_mul3(&tmp2, (const fp *)&Abatch.z, (const fp *)&tmp3);
705 fp_cmov_ctidh(&Abatch.z, (const fp *)&tmp2, want);
706 for (int64_t h = 0; h < Plen; ++h)
708 fp_mul3(&tmp1, (const fp *)&tmp4, (const fp *)&Psum[h]);
709 fp_mul3(&tmp0, (const fp *)&tmp3, (const fp *)&Pdif[h]);
710 fp_add3(&tmp2, (const fp *)&tmp0, (const fp *)&tmp1);
711 fp_mul2(&tmp2, (const fp *)&Qbatch[h].x);
712 fp_cmov_ctidh(&Qbatch[h].x, (const fp *)&tmp2, want);
713 fp_sub3(&tmp2, (const fp *)&tmp0, (const fp *)&tmp1);
714 fp_mul2(&tmp2, (const fp *)&Qbatch[h].z);
715 fp_cmov_ctidh(&Qbatch[h].z, (const fp *)&tmp2, want);
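Lines 695-715 process the leftover multiples outside the 2*bs*gs rectangle. The loop bound depends only on kupper, while ignore is derived from the actual degree k; the flag want = -((i - ignore) >> 61) is 1 exactly while i < ignore, so the fp_cmov_ctidh calls accept the new partial products on the first ignore iterations and leave the accumulators untouched on the padding iterations. A standalone sketch of that flag computation is shown below; the helper name lessthan_flag is illustrative, and, like the original, it assumes arithmetic right shift of negative int64_t values.

#include <assert.h>
#include <stdint.h>

/* Illustrative only: branch-free "i < ignore" flag as used at lines 698/740,
   valid while |i - ignore| < 2^61.  Relies on arithmetic right shift of a
   negative int64_t, as the original code does. */
static int64_t lessthan_flag(int64_t i, int64_t ignore)
{
    return -((i - ignore) >> 61);   /* 1 if i < ignore, else 0 */
}

int main(void)
{
    assert(lessthan_flag(3, 5) == 1);
    assert(lessthan_flag(5, 5) == 0);
    assert(lessthan_flag(9, 5) == 0);
    return 0;
}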
722 int64_t ignore = (k + 1) / 2;
725 fp_sub3(&tmp4, (const fp *)&M[1].x, (const fp *)&M[1].z);
726 fp_add3(&tmp3, (const fp *)&M[1].x, (const fp *)&M[1].z);
730 for (int64_t h = 0; h < Plen; ++h)
732 fp_mul3(&tmp1, (const fp *)&tmp4, (const fp *)&Psum[h]);
733 fp_mul3(&tmp0, (const fp *)&tmp3, (const fp *)&Pdif[h]);
734 fp_add3(&Qbatch[h].x, (const fp *)&tmp0, (const fp *)&tmp1);
735 fp_sub3(&Qbatch[h].z, (const fp *)&tmp0, (const fp *)&tmp1);
738 for (int64_t i = 2; i <= (kupper - 1) / 2; ++i)
740 int64_t want = -((i - ignore) >> 61);
743 fp_sub3(&tmp4, (const fp *)&M[i].x, (const fp *)&M[i].z);
744 fp_add3(&tmp3, (const fp *)&M[i].x, (const fp *)&M[i].z);
745 fp_mul3(&tmp2, (const fp *)&Abatch.x, (const fp *)&tmp4);
746 fp_cmov_ctidh(&Abatch.x, (const fp *)&tmp2, want);
747 fp_mul3(&tmp2, (const fp *)&Abatch.z, (const fp *)&tmp3);
748 fp_cmov_ctidh(&Abatch.z, (const fp *)&tmp2, want);
749 for (int64_t h = 0; h < Plen; ++h)
752 fp_mul3(&tmp1, (const fp *)&tmp4, (const fp *)&Psum[h]);
753 fp_mul3(&tmp0, (const fp *)&tmp3, (const fp *)&Pdif[h]);
754 fp_add3(&tmp2, (const fp *)&tmp0, (const fp *)&tmp1);
755 fp_mul2(&tmp2, (const fp *)&Qbatch[h].x);
756 fp_cmov_ctidh(&Qbatch[h].x, (const fp *)&tmp2, want);
757 fp_sub3(&tmp2, (const fp *)&tmp0, (const fp *)&tmp1);
758 fp_mul2(&tmp2, (const fp *)&Qbatch[h].z);
759 fp_cmov_ctidh(&Qbatch[h].z, (const fp *)&tmp2, want);
764 for (int64_t h = 0; h < Plen; ++h)
768 fp_sq1(&Qbatch[h].x);
769 fp_sq1(&Qbatch[h].z);
773 fp_mul2(&P[h].x, (const fp *)&Qbatch[h].x);
774 fp_mul2(&P[h].z, (const fp *)&Qbatch[h].z);
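The pattern above is the standard x-only evaluation of an odd-degree isogeny. Writing (Xi : Zi) for M[i] and (X : Z) for P[h], the combinations tmp0 + tmp1 and tmp0 - tmp1 equal 2*(Xi*X - Zi*Z) and 2*(Zi*X - Xi*Z), so Qbatch[h].x and Qbatch[h].z end up holding the products of (X*Xi - Z*Zi) and of (X*Zi - Z*Xi) over the selected multiples, up to a common scalar; squaring both at lines 768-769 and multiplying them into P[h] at lines 773-774 gives the Costello-Hisil-style image point X' = X*(prod(X*Xi - Z*Zi))^2, Z' = Z*(prod(X*Zi - Z*Xi))^2.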
778 powpow8mod(&Aed.x, (const fp *)&Abatch.z, k, kupper);
780 powpow8mod(&Aed.z, (const fp *)&Abatch.x, k, kupper);
784 fp_add3(&A->x, (const fp *)&Aed.x, (const fp *)&Aed.z);
785 fp_sub3(&A->z, (const fp *)&Aed.x, (const fp *)&Aed.z);
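Lines 778-785 finish the codomain coefficient. Abatch.x and Abatch.z hold the products of (M[i].x - M[i].z) and (M[i].x + M[i].z) over the multiples actually selected by the cmov masks; powpow8mod (defined outside this excerpt, and taking both the real degree k and the padded bound kupper, presumably so that the exponentiation pattern does not depend on the secret degree) folds them into the Edwards pair, and the last two lines map that pair back to a projective Montgomery coefficient via A.x = Aed.x + Aed.z and A.z = Aed.x - Aed.z, undoing (up to the library's scaling convention) the conversion performed at lines 455-467.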