n_ll_is_prime trial division

13 views
Skip to first unread message

David Sparks

unread,
Apr 5, 2026, 11:20:52 AM (4 days ago) Apr 5
to flint-devel, Fredrik Johansson, David Sparks
The code comment says
On Zen 3, unrolling by two and combining comparisons bitwise
is faster than a more branchy version.

Here are a few, progressively faster, different ways of doing the
comparison (excerpted into a standalone test function to make the
generated code easier to read):

#include <stdbool.h>
#include <stdint.h>
typedef uint64_t ulong;

extern ulong t[][4]; // The ll_trial_primes array
/* Exact double-limb "less than" tests against two consecutive entries of
   the ll_trial_primes table, combined bitwise (branch-free on x86-64). */
bool cmp1(size_t i, ulong a1, ulong a0, ulong b1, ulong b0)
{
ulong v0 = t[i][2], v1 = t[i][3];
ulong w0 = t[i + 1][2], w1 = t[i + 1][3];

/* (a1:a0) < (v1:v0) as a two-limb comparison, and likewise for b/w. */
bool a_less = (a1 < v1) | ((a1 == v1) & (a0 < v0));
bool b_less = (b1 < w1) | ((b1 == w1) & (b0 < w0));

return a_less | b_less;
}

/* Same comparison as cmp1, but the low-limb result is folded into the
   high-limb compare as a carry: (a1:a0) < (v1:v0) iff a1 < v1 + (a0 < v0).
   This is safe because v1 and w1 are never UWORD_MAX (stated below), so
   adding the boolean cannot overflow. */
bool cmp2(size_t i, ulong a1, ulong a0, ulong b1, ulong b0)
{
ulong v0 = t[i][2], v1 = t[i][3];
ulong w0 = t[i + 1][2], w1 = t[i + 1][3];

bool a_less = a1 < v1 + (ulong) (a0 < v0);
bool b_less = b1 < w1 + (ulong) (b0 < w0);

return a_less | b_less;
}

#ifdef __SIZEOF_INT128__
/* Same comparison as cmp1/cmp2 via full 128-bit values; on x86-64 this
   compiles to a cmp/sbb pair per comparison (see the asm below). */
bool cmp3(size_t i, ulong a1, ulong a0, ulong b1, ulong b0)
{
unsigned __int128 a = (unsigned __int128)a1 << 64 | a0;
unsigned __int128 b = (unsigned __int128)b1 << 64 | b0;
unsigned __int128 v = (unsigned __int128)t[i][3] << 64 | t[i][2];
unsigned __int128 w = (unsigned __int128)t[i+1][3] << 64 | t[i+1][2];

/* Fix: the original read "(a < v) ! (b < w)" — `!` is unary in C, so
   that does not compile; the intended combiner (matching cmp1/cmp2 and
   the generated setc/orl sequence) is bitwise OR. */
return (a < v) | (b < w);
}
#endif

GCC 15.2.0 compiles these to (boilerplate trimmed):

.file "cmp.c"
.text
.globl cmp1
cmp1:
pushq %rbx
movq .refptr.t(%rip), %rax
movq %rcx, %r10
addq $1, %rcx
movq %r8, %rbx
movq %rdx, %r11
salq $5, %r10
salq $5, %rcx
addq %rax, %r10
leaq (%rax,%rcx), %rdx
movq 24(%r10), %r8
cmpq 16(%r10), %rbx
movq 24(%rdx), %rcx
setb %al
movq 16(%rdx), %rbx
cmpq %r11, %r8
sete %r10b
andl %r10d, %eax
cmpq %rcx, %r9
setb %r10b
orl %r10d, %eax
cmpq %r8, %r11
setb %r8b
orl %r8d, %eax
cmpq %rbx, 48(%rsp)
setb %dl
cmpq %r9, %rcx
sete %cl
andl %ecx, %edx
orl %edx, %eax
popq %rbx
ret
.globl cmp2
cmp2:
movq %rdx, %r11
movq .refptr.t(%rip), %rdx
leaq 1(%rcx), %r10
salq $5, %rcx
addq %rdx, %rcx
movq 24(%rcx), %rax
cmpq 16(%rcx), %r8
adcq $0, %rax
cmpq %rax, %r11
setb %al
salq $5, %r10
addq %r10, %rdx
movq 16(%rdx), %rcx
movq 24(%rdx), %rdx
cmpq %rcx, 40(%rsp)
adcq $0, %rdx
cmpq %rdx, %r9
setb %dl
orl %edx, %eax
ret
.globl cmp3
cmp3:
movq .refptr.t(%rip), %r10
leaq 1(%rcx), %r11
salq $5, %rcx
addq %r10, %rcx
cmpq 16(%rcx), %r8
sbbq 24(%rcx), %rdx
setc %al
salq $5, %r11
addq %r11, %r10
movq 16(%r10), %rdx
cmpq %rdx, 40(%rsp)
sbbq 24(%r10), %r9
setc %dl
orl %edx, %eax
ret

cmp2 works because we know that v1 and w1 are not -1, so we
can add a boolean to them without risk of overflow.

But cmp3 is yet faster, and there's a preprocessor symbol that
it can be conditioned on.

David Sparks

unread,
Apr 5, 2026, 6:59:16 PM (4 days ago) Apr 5
to Edgar Costa, flint...@googlegroups.com, Fredrik Johansson
Thank you for the measurements! One mistake I made was to
write my examples as "product < limit", when the original
code, and what we want, is "product <= limit". With that
tweak to benchmark.c (and the RNG improved; rand() only
returns 31 bits, so "rand() << 32 | rand()" only gives
you 62 bits), my i5-4570 gives:

Initializing data...
Running benchmark (10000000 iterations)...

cmp1 (Bitwise) : 35.39 34.45 34.69 35.99 37.86 34.09 37.76 36.38 33.99 35.13 cycles/call
cmp2 (Addition): 33.77 32.97 34.51 34.13 36.12 32.70 33.33 33.22 33.22 33.26 cycles/call
cmp3 (int128) : 34.39 33.62 32.77 34.20 36.09 33.99 32.76 35.62 32.45 34.73 cycles/call

Sink total: 22281024

gcc (15.2.0) -m64 -O2 -march=native

The revised asm output (with the corrected <= condition) is slightly
different, but not substantially:
.globl cmp1
cmp1:
movq %rdx, %r10
movq .refptr.t(%rip), %rdx
movq %rcx, %rax
salq $5, %rax
addq %rdx, %rax
movq 24(%rax), %r11
addq $1, %rcx
salq $5, %rcx
addq %rcx, %rdx
movq 24(%rdx), %rcx
cmpq %r8, 16(%rax)
setnb %al
cmpq %r10, %r11
sete %r8b
andl %r8d, %eax
cmpq %rcx, %r9
setb %r8b
orl %r8d, %eax
cmpq %r11, %r10
setb %r8b
orl %r8d, %eax
movq 40(%rsp), %r10
cmpq %r10, 16(%rdx)
setnb %dl
cmpq %r9, %rcx
sete %cl
andl %ecx, %edx
orl %edx, %eax
ret
.globl cmp2
cmp2:
movq %rdx, %r11
leaq 1(%rcx), %r10
movq .refptr.t(%rip), %rdx
salq $5, %rcx
addq %rdx, %rcx
cmpq %r8, 16(%rcx)
movq 24(%rcx), %rax
sbbq $-1, %rax
cmpq %rax, %r11
setb %al
salq $5, %r10
addq %r10, %rdx
movq 40(%rsp), %rcx
cmpq %rcx, 16(%rdx)
movq 24(%rdx), %rdx
sbbq $-1, %rdx
cmpq %rdx, %r9
setb %dl
orl %edx, %eax
ret
.globl cmp3
cmp3:
pushq %rbx
leaq 1(%rcx), %rbx
movq %rdx, %r11
salq $5, %rcx
addq .refptr.t(%rip), %rcx
movq 24(%rcx), %rdx
cmpq %r8, 16(%rcx)
movq %rdx, %rax
sbbq %r11, %rax
setnc %al
salq $5, %rbx
movq %rbx, %rdx
addq .refptr.t(%rip), %rdx
movq 24(%rdx), %r11
movq 48(%rsp), %rbx
cmpq %rbx, 16(%rdx)
movq %r11, %rbx
sbbq %r9, %rbx
setnc %dl
benchmark.c

Fredrik Johansson

unread,
Apr 6, 2026, 7:54:24 PM (3 days ago) Apr 6
to David Sparks, Edgar Costa, flint...@googlegroups.com
Hi David,

Interesting, thanks for the suggestion. In the <=82 bit range, the speedup on the primality test when using cmp3 instead of cmp1 is around 1-3% for random input.

We can actually do even better, potentially.

Suppose instead of comparing (ahi * 2^64 + alo) <= (bhi * 2^64 + blo), we just do ahi < bhi. This is now up to 5% faster than the primality test using cmp1.

A priori this is now an inexact divisibility test which might allow some composites to slip through, though this isn't a disaster as long as such composites are sufficiently rare, since they will be caught by the base-2 sprp test later anyway.

Testing this, it seems that failures only become frequent when the high word is very close to UWORD_MAX. Empirically, for input nhi * 2^64 + n_randlimb(), the probability that this inexact trial division by 64 primes doesn't give the same result as the exact trial division by 64 primes seems to be:

nhi = UWORD_MAX -> 80%
nhi = (UWORD_MAX - 1) -> 30%
nhi = (UWORD_MAX - 2) -> 22%
...
nhi = (UWORD_MAX - 264) -> 0.071%

For nhi <= UWORD_MAX - 265 I don't get *any* failures after trying millions of random inputs.


Let's see what happens if we set an artificially small limb size 2^bits and check by brute force where testing (odd) double-limb x for divisibility by (odd) single-limb d gives the wrong result:

#include "ulong_extras.h"

/* Brute-force search, at an artificially small limb size, for the smallest
   odd x (double-limb) where the "inexact" divisibility test — comparing
   only the top limb of x * d^(-1) mod B^2 against the top limb of the
   threshold — disagrees with the exact single-comparison test.
   Prints one line per odd divisor d plus a summary of the overall minimum. */
int main()
{
    ulong bits, B, B2, d, x, inv1d, inv2d, minx, minxd;
    int divis1, divis2;

    bits = 4;  // Bits in limb
    B = UWORD(1) << bits;   // Limb radix
    B2 = UWORD(1) << (2 * bits);    // Double limb radix

    /* Track the smallest failing x over all divisors; minx = B2 means
       "no failure seen yet" since every tested x is < B2. */
    minx = B2;
    minxd = 0;

    // Check all odd single-limb divisors
    for (d = 3; d < B; d += 2)
    {
        /* n_binvert: presumably the inverse of odd d modulo 2^FLINT_BITS
           (TODO confirm); only the low 2*bits are used via the mask below.
           inv2d = floor((B^2 - 1) / d) is the divisibility threshold:
           d | x  iff  x * d^(-1) mod B^2 <= inv2d. */
        inv1d = n_binvert(d);
        inv2d = (B2 - 1) / d;

        // Check all odd double-limb integers
        for (x = 3; x < B2; x += 2)
        {
            // Exact divisibility of x by d
            divis1 = ((x * inv1d) & (B2 - 1)) <= inv2d;
            // Inexact(?) divisibility of x by d comparing only the top limb
            divis2 = (((x * inv1d) & (B2 - 1)) >> bits) < (inv2d >> bits);

            if (divis1 != divis2)
            {
                if (x < minx)
                {
                    minx = x;
                    minxd = d;
                }

                /* Report the first failure for this d and move on; later
                   failures for the same d are not interesting here. */
                flint_printf("d = %wu  fails at x = %wu  (%.8f B^2)\n",
                    d, x, x / (double) B2);
                break;
            }
        }
    }

    flint_printf("smallest failing x: %wu  (%.8f B^2)  (for d = %wu)\n",
        minx, minx / (double) B2, minxd);
}



Example output for 4-bit limbs:

d = 3  fails at x = 243  (0.949219 B^2)
d = 5  fails at x = 245  (0.957031 B^2)
d = 7  fails at x = 231  (0.902344 B^2)
d = 9  fails at x = 153  (0.597656 B^2)
d = 11  fails at x = 187  (0.730469 B^2)
d = 13  fails at x = 221  (0.863281 B^2)
d = 15  fails at x = 255  (0.996094 B^2)
smallest failing x: 153  (0.597656 B^2)  (for d = 9)


Conjecture 1: this sloppy version of the 2x1 limb divisibility test is actually exact for x < 2^(2*bits-1) (e.g. for all 127-bit integers when we have 64-bit limbs).

Numerical evidence:

bits = 12: smallest failing x: 8394753  (0.500366 B^2)  (for d = 2049)
bits = 13: smallest failing x: 33566721  (0.500183 B^2)  (for d = 4097)
bits = 14: smallest failing x: 134242305  (0.500092 B^2)  (for d = 8193)
...

Conjecture 2: for bounded d <= D, the test is exact for all x < (1 - epsilon) * 2^(2*bits), where epsilon converges (rapidly) to zero as a function of the bit size (of order 2^(-bits)?).

Numerical evidence:

8-bit limbs:

d = 3  fails at x = 65283  (0.99613953 x B^2)
d = 5  fails at x = 65285  (0.99617004 x B^2)
d = 7  fails at x = 64519  (0.98448181 x B^2)
d = 9  fails at x = 64521  (0.98451233 x B^2)
d = 11  fails at x = 64779  (0.98844910 x B^2)
d = 13  fails at x = 63245  (0.96504211 x B^2)
d = 15  fails at x = 65295  (0.99632263 x B^2)
d = 17  fails at x = 65297  (0.99635315 x B^2)
...

16-bit limbs:

d = 3  fails at x = 4294901763  (0.99998474 x B^2)
d = 5  fails at x = 4294901765  (0.99998474 x B^2)
d = 7  fails at x = 4294836231  (0.99996948 x B^2)
d = 9  fails at x = 4294508553  (0.99989319 x B^2)
d = 11  fails at x = 4294377483  (0.99986267 x B^2)
d = 13  fails at x = 4294770701  (0.99995423 x B^2)
d = 15  fails at x = 4294901775  (0.99998474 x B^2)
d = 17  fails at x = 4294901777  (0.99998475 x B^2)
...

20-bit limbs:

d = 3  fails at x = 1099510579203  (0.99999905 x B^2)
d = 5  fails at x = 1099510579205  (0.99999905 x B^2)
d = 7  fails at x = 1099507433479  (0.99999619 x B^2)
...


Question: can one work out some effective, explicit bound for epsilon as a function of bits and D? For example, can we give an explicit bound for epsilon when bits = 64 and D = 2^8 or D = 2^16?

Neither conjecture needs a proof to be used in a primality test, since we ultimately fall back on other methods as noted above. However, proofs would allow using this slightly faster divisibility test in other settings where exactness does matter.

Fredrik

David Sparks

unread,
Apr 7, 2026, 3:24:04 PM (2 days ago) Apr 7
to Fredrik Johansson, Edgar Costa, flint...@googlegroups.com
On Monday, April 6th, 2026 at 23:54, Fredrik Johansson <fredrik....@gmail.com> wrote:

> Conjecture 1: this sloppy version of the 2x1 limb divisibility test is actually exact for x < 2^(2*bits-1) (e.g. for all 127-bit integers when we have 64-bit limbs).

Really excellent thinking!

This part is easy. If x is a multiple of an odd divisor d,
multiplication by the inverse actually computes the quotient x/d.

So if x is bounded, x/d will be bounded, too.

Let B = floor((4^bits - 1) / d) be the maximum possible
quotient, produced when x = d*B.

Let B_lo be the low k bits of B. As long as x/d < B - B_lo,
testing only the high (bits-k) bits will suffice.

In other words, x < d*B - d*B_lo. We know that B_lo <= 2^k - 1, and
d*B >= (2^bits - d), so
x < 4^bits - d - d*(2^k - 1)
= 4^bits - d*2^k
is a safe bound.

(I used an arbitrary k so we can consider things like testing only
the upper 32 bits.)

> bits = 12: smallest failing x: 8394753 (0.500366 B^2) (for d = 2049)
> bits = 13: smallest failing x: 33566721 (0.500183 B^2) (for d = 4097)
> bits = 14: smallest failing x: 134242305 (0.500092 B^2) (for d = 8193)

These basically match the bound I computed. When d*2^k reaches
half of 2^(2k-1), i.e. d >= d^k-1, the first failure comes
at (almost) 2^(2k-1) = B/2.

> Conjecture 2: for bounded d <= D, the test is exact for all x < (1 - epsilon) * 2^(2*bits), where epsilon converges (rapidly) to zero as a function of the bit size (of order 2^(-bits)?).

The same formula applies. The test is exact as long as
x < 4^bits - d*2^k
<= 4^bits - D*2^k
= 4^bits * (1 - D*2^(k-2*bits))
= 4^bits * (1 - D/2^(2*bits-k))

This is using my slight overestimate; you could push slightly
for specific values of D based on the actual values of B_lo, but
you don't gain a lot. For a single divisor d, you can on average
cut epsilon in half, but since you're considering all (odd prime)
d < D, the bound is the minimum of the individual bounds.

> Neither conjecture needs a proof to be used in a primality test, since we ultimately fall back on other methods as noted above. However, proofs would allow using this slightly faster divisibility test in other settings where exactness does matter.

Hopefully the above helps.

Fredrik Johansson

unread,
Apr 8, 2026, 3:49:35 AM (yesterday) Apr 8
to flint-devel
That works nicely, thanks. Now implemented in https://github.com/flintlib/flint/pull/2629

Next step will be to use this divisibility test for factoring too.

Fredrik

David Sparks

unread,
Apr 8, 2026, 7:32:44 AM (yesterday) Apr 8
to flint...@googlegroups.com
Another way to write the full-range test is:

if (unlikely((a1 <= abound1) | (b1 <= bbound1)) &&
    (n_ll_le(...) | n_ll_le(...)))

For a single test, it can be written very nicely:
if (unlikely(a1 <= abound1) && (a1 < abound1 || a0 <= abound0))
... but that elegance disappears when we doing two tests in
parallel.

> + #if FLINT_BITS == 64 && defined(__GNUC__)

Also, rather than __GNUC__, using __SIZEOF_INT128__ tests if this
particular platform and compiler version has __int128 support.

> + typedef struct {
> + const char *s;
> + int expect;
> + } testcase;

It totally doesn't matter for such a small number of tests, but
one trick I've used in such test code is to place the expected
result in the first character of the string. E.g.

/* The first character is the expected result, -1, 0, or +1 */
static const char * testcases[] = {
"\377" "3404730287403079539471001", /* oversize */
"\377" "3317044064679887385961981", /* oversize */
"\0" "3110269097300703345712981",
...
};

expect = (signed char)testcases[i][0];
fmpz_set_str(f, testcases[i]+1, 10);

This avoids a lot of unnecessary alignment padding which bloats
"expect" to 64 bits when it only needs 2. The space is unimportant
for test code, but using string-pasting as demonstrated above can
make it quite legible.

Of course, if you really cared about space, you could skip the array
of pointers as well and just run all the test cases together:

/* \0-separated test cases. The first character of each is
the expected result (-1, 0 or +1) plus two.
*/
static const char testcases[] =
"\1" "3404730287403079539471001" /* oversize */
"\0\1" "3317044064679887385961981" /* oversize */
"\0\2" "3110269097300703345712981"
...
"\0"; /* Implicit trailing \0 (two in a row) ends the list */

for (const char *t = testcases; *t; t += strlen(t) + 1) {
int expected = *t++ - 2;
fmpz_set_str(f, t, 10);
...
}

Reply all
Reply to author
Forward
0 new messages