n_ll_is_prime trial division

13 views
Skip to first unread message

David Sparks

unread,
Apr 5, 2026, 11:20:52 AM (4 days ago) Apr 5
to flint-devel, Fredrik Johansson, David Sparks
The code comment says
On Zen 3, unrolling by two and combining comparisons bitwise
is faster than a more branchy version.

Here are a few, progressively faster, different ways of doing the
comparison (excerpted into a standalone test function to make the
generated code easier to read):

#include <stdbool.h>
#include <stdint.h>
typedef uint64_t ulong;

extern ulong t[][4]; // The ll_trial_primes array
/* Exact double-limb "less than" tests against two consecutive entries of
   the ll_trial_primes table, combined bitwise (branch-free on x86-64). */
bool cmp1(size_t i, ulong a1, ulong a0, ulong b1, ulong b0)
{
ulong v0 = t[i][2], v1 = t[i][3];
ulong w0 = t[i + 1][2], w1 = t[i + 1][3];

/* (a1:a0) < (v1:v0) as a two-limb comparison, and likewise for b/w. */
bool a_less = (a1 < v1) | ((a1 == v1) & (a0 < v0));
bool b_less = (b1 < w1) | ((b1 == w1) & (b0 < w0));

return a_less | b_less;
}

/* Same comparison as cmp1, but the low-limb result is folded into the
   high-limb compare as a carry: (a1:a0) < (v1:v0) iff a1 < v1 + (a0 < v0).
   This is safe because v1 and w1 are never UWORD_MAX (stated below), so
   adding the boolean cannot overflow. */
bool cmp2(size_t i, ulong a1, ulong a0, ulong b1, ulong b0)
{
ulong v0 = t[i][2], v1 = t[i][3];
ulong w0 = t[i + 1][2], w1 = t[i + 1][3];

bool a_less = a1 < v1 + (ulong) (a0 < v0);
bool b_less = b1 < w1 + (ulong) (b0 < w0);

return a_less | b_less;
}

#ifdef __SIZEOF_INT128__
/* Same comparison as cmp1/cmp2 via full 128-bit values; on x86-64 this
   compiles to a cmp/sbb pair per comparison (see the asm below). */
bool cmp3(size_t i, ulong a1, ulong a0, ulong b1, ulong b0)
{
unsigned __int128 a = (unsigned __int128)a1 << 64 | a0;
unsigned __int128 b = (unsigned __int128)b1 << 64 | b0;
unsigned __int128 v = (unsigned __int128)t[i][3] << 64 | t[i][2];
unsigned __int128 w = (unsigned __int128)t[i+1][3] << 64 | t[i+1][2];

/* Fix: the original read "(a < v) ! (b < w)" — `!` is unary in C, so
   that does not compile; the intended combiner (matching cmp1/cmp2 and
   the generated setc/orl sequence) is bitwise OR. */
return (a < v) | (b < w);
}
#endif

GCC 15.2.0 compiles these to (boilerplate trimmed):

.file "cmp.c"
.text
.globl cmp1
cmp1:
pushq %rbx
movq .refptr.t(%rip), %rax
movq %rcx, %r10
addq $1, %rcx
movq %r8, %rbx
movq %rdx, %r11
salq $5, %r10
salq $5, %rcx
addq %rax, %r10
leaq (%rax,%rcx), %rdx
movq 24(%r10), %r8
cmpq 16(%r10), %rbx
movq 24(%rdx), %rcx
setb %al
movq 16(%rdx), %rbx
cmpq %r11, %r8
sete %r10b
andl %r10d, %eax
cmpq %rcx, %r9
setb %r10b
orl %r10d, %eax
cmpq %r8, %r11
setb %r8b
orl %r8d, %eax
cmpq %rbx, 48(%rsp)
setb %dl
cmpq %r9, %rcx
sete %cl
andl %ecx, %edx
orl %edx, %eax
popq %rbx
ret
.globl cmp2
cmp2:
movq %rdx, %r11
movq .refptr.t(%rip), %rdx
leaq 1(%rcx), %r10
salq $5, %rcx
addq %rdx, %rcx
movq 24(%rcx), %rax
cmpq 16(%rcx), %r8
adcq $0, %rax
cmpq %rax, %r11
setb %al
salq $5, %r10
addq %r10, %rdx
movq 16(%rdx), %rcx
movq 24(%rdx), %rdx
cmpq %rcx, 40(%rsp)
adcq $0, %rdx
cmpq %rdx, %r9
setb %dl
orl %edx, %eax
ret
.globl cmp3
cmp3:
movq .refptr.t(%rip), %r10
leaq 1(%rcx), %r11
salq $5, %rcx
addq %r10, %rcx
cmpq 16(%rcx), %r8
sbbq 24(%rcx), %rdx
setc %al
salq $5, %r11
addq %r11, %r10
movq 16(%r10), %rdx
cmpq %rdx, 40(%rsp)
sbbq 24(%r10), %r9
setc %dl
orl %edx, %eax
ret

cmp2 works because we know that v1 and w1 are not -1, so we
can add a boolean to them without risk of overflow.

But cmp3 is yet faster, and there's a preprocessor symbol that
it can be conditioned on.

David Sparks

unread,
Apr 5, 2026, 6:59:16 PM (4 days ago) Apr 5
to Edgar Costa, flint...@googlegroups.com, Fredrik Johansson
Thank you for the measurements! One mistake I made was to
write my examples as "product < limit", when the original
code, and what we want, is "product <= limit". With that
tweak to benchmark.c (and the RNG improved; rand() only
returns 31 bits, so "rand() << 32 | rand()" only gives
you 62 bits), my i5-4570 gives:

Initializing data...
Running benchmark (10000000 iterations)...

cmp1 (Bitwise) : 35.39 34.45 34.69 35.99 37.86 34.09 37.76 36.38 33.99 35.13 cycles/call
cmp2 (Addition): 33.77 32.97 34.51 34.13 36.12 32.70 33.33 33.22 33.22 33.26 cycles/call
cmp3 (int128) : 34.39 33.62 32.77 34.20 36.09 33.99 32.76 35.62 32.45 34.73 cycles/call

Sink total: 22281024

gcc (15.2.0) -m64 -O2 -march=native

The revised asm output (with the corrected <= condition) is slightly
different, but not substantially:
.globl cmp1
cmp1:
movq %rdx, %r10
movq .refptr.t(%rip), %rdx
movq %rcx, %rax
salq $5, %rax
addq %rdx, %rax
movq 24(%rax), %r11
addq $1, %rcx
salq $5, %rcx
addq %rcx, %rdx
movq 24(%rdx), %rcx
cmpq %r8, 16(%rax)
setnb %al
cmpq %r10, %r11
sete %r8b
andl %r8d, %eax
cmpq %rcx, %r9
setb %r8b
orl %r8d, %eax
cmpq %r11, %r10
setb %r8b
orl %r8d, %eax
movq 40(%rsp), %r10
cmpq %r10, 16(%rdx)
setnb %dl
cmpq %r9, %rcx
sete %cl
andl %ecx, %edx
orl %edx, %eax
ret
.globl cmp2
cmp2:
movq %rdx, %r11
leaq 1(%rcx), %r10
movq .refptr.t(%rip), %rdx
salq $5, %rcx
addq %rdx, %rcx
cmpq %r8, 16(%rcx)
movq 24(%rcx), %rax
sbbq $-1, %rax
cmpq %rax, %r11
setb %al
salq $5, %r10
addq %r10, %rdx
movq 40(%rsp), %rcx
cmpq %rcx, 16(%rdx)
movq 24(%rdx), %rdx
sbbq $-1, %rdx
cmpq %rdx, %r9
setb %dl
orl %edx, %eax
ret
.globl cmp3
cmp3:
pushq %rbx
leaq 1(%rcx), %rbx
movq %rdx, %r11
salq $5, %rcx
addq .refptr.t(%rip), %rcx
movq 24(%rcx), %rdx
cmpq %r8, 16(%rcx)
movq %rdx, %rax
sbbq %r11, %rax
setnc %al
salq $5, %rbx
movq %rbx, %rdx
addq .refptr.t(%rip), %rdx
movq 24(%rdx), %r11
movq 48(%rsp), %rbx
cmpq %rbx, 16(%rdx)
movq %r11, %rbx
sbbq %r9, %rbx
setnc %dl
benchmark.c

Fredrik Johansson

unread,
Apr 6, 2026, 7:54:24 PM (3 days ago) Apr 6
to David Sparks, Edgar Costa, flint...@googlegroups.com
Hi David,

Interesting, thanks for the suggestion. In the <=82 bit range, the speedup on the primality test when using cmp3 instead of cmp1 is around 1-3% for random input.

We can actually do even better, potentially.

Suppose instead of comparing (ahi * 2^64 + alo) <= (bhi * 2^64 + blo), we just do ahi < bhi. This is now up to 5% faster than the primality test using cmp1.

A priori this is now an inexact divisibility test which might allow some composites to slip through, though this isn't a disaster as long as such composites are sufficiently rare, since they will be caught by the base-2 sprp test later anyway.

Testing this, it seems that failures only become frequent when the high word is very close to UWORD_MAX. Empirically, for input nhi * 2^64 + n_randlimb(), the probability that this inexact trial division by 64 primes doesn't give the same result as the exact trial division by 64 primes seems to be:

nhi = UWORD_MAX -> 80%
nhi = (UWORD_MAX - 1) -> 30%
nhi = (UWORD_MAX - 2) -> 22%
...
nhi = (UWORD_MAX - 264) -> 0.071%

For nhi <= UWORD_MAX - 265 I don't get *any* failures after trying millions of random inputs.


Let's see what happens if we set an artificially small limb size 2^bits and check by brute force where testing (odd) double-limb x for divisibility by (odd) single-limb d gives the wrong result:

#include "ulong_extras.h"

/* Brute-force search, at an artificially small limb size, for the smallest
   odd x (double-limb) where the "inexact" divisibility test — comparing
   only the top limb of x * d^(-1) mod B^2 against the top limb of the
   threshold — disagrees with the exact single-comparison test.
   Prints one line per odd divisor d plus a summary of the overall minimum. */
int main()
{
    ulong bits, B, B2, d, x, inv1d, inv2d, minx, minxd;
    int divis1, divis2;

    bits = 4;  // Bits in limb
    B = UWORD(1) << bits;   // Limb radix
    B2 = UWORD(1) << (2 * bits);    // Double limb radix

    /* Track the smallest failing x over all divisors; minx = B2 means
       "no failure seen yet" since every tested x is < B2. */
    minx = B2;
    minxd = 0;

    // Check all odd single-limb divisors
    for (d = 3; d < B; d += 2)
    {
        /* n_binvert: presumably the inverse of odd d modulo 2^FLINT_BITS
           (TODO confirm); only the low 2*bits are used via the mask below.
           inv2d = floor((B^2 - 1) / d) is the divisibility threshold:
           d | x  iff  x * d^(-1) mod B^2 <= inv2d. */
        inv1d = n_binvert(d);
        inv2d = (B2 - 1) / d;

        // Check all odd double-limb integers
        for (x = 3; x < B2; x += 2)
        {
            // Exact divisibility of x by d
            divis1 = ((x * inv1d) & (B2 - 1)) <= inv2d;
            // Inexact(?) divisibility of x by d comparing only the top limb
            divis2 = (((x * inv1d) & (B2 - 1)) >> bits) < (inv2d >> bits);

            if (divis1 != divis2)
            {
                if (x < minx)
                {
                    minx = x;
                    minxd = d;
                }

                /* Report the first failure for this d and move on; later
                   failures for the same d are not interesting here. */
                flint_printf("d = %wu  fails at x = %wu  (%.8f B^2)\n",
                    d, x, x / (double) B2);
                break;
            }
        }
    }

    flint_printf("smallest failing x: %wu  (%.8f B^2)  (for d = %wu)\n",
        minx, minx / (double) B2, minxd);
}



Example output for 4-bit limbs:

d = 3  fails at x = 243  (0.949219 B^2)
d = 5  fails at x = 245  (0.957031 B^2)
d = 7  fails at x = 231  (0.902344 B^2)
d = 9  fails at x = 153  (0.597656 B^2)
d = 11  fails at x = 187  (0.730469 B^2)
d = 13  fails at x = 221  (0.863281 B^2)
d = 15  fails at x = 255  (0.996094 B^2)
smallest failing x: 153  (0.597656 B^2)  (for d = 9)


Conjecture 1: this sloppy version of the 2x1 limb divisibility test is actually exact for x < 2^(2*bits-1) (e.g. for all 127-bit integers when we have 64-bit limbs).

Numerical evidence:

bits = 12: smallest failing x: 8394753  (0.500366 B^2)  (for d = 2049)
bits = 13: smallest failing x: 33566721  (0.500183 B^2)  (for d = 4097)
bits = 14: smallest failing x: 134242305  (0.500092 B^2)  (for d = 8193)
...

Conjecture 2: for bounded d <= D, the test is exact for all x < (1 - epsilon) * 2^(2*bits), where epsilon converges (rapidly) to zero as a function of the bit size (of order 2^(-bits)?).

Numerical evidence:

8-bit limbs:

d = 3  fails at x = 65283  (0.99613953 x B^2)
d = 5  fails at x = 65285  (0.99617004 x B^2)
d = 7  fails at x = 64519  (0.98448181 x B^2)
d = 9  fails at x = 64521  (0.98451233 x B^2)
d = 11  fails at x = 64779  (0.98844910 x B^2)
d = 13  fails at x = 63245  (0.96504211 x B^2)
d = 15  fails at x = 65295  (0.99632263 x B^2)
d = 17  fails at x = 65297  (0.99635315 x B^2)
...

16-bit limbs:

d = 3  fails at x = 4294901763  (0.99998474 x B^2)
d = 5  fails at x = 4294901765  (0.99998474 x B^2)
d = 7  fails at x = 4294836231  (0.99996948 x B^2)
d = 9  fails at x = 4294508553  (0.99989319 x B^2)
d = 11  fails at x = 4294377483  (0.99986267 x B^2)
d = 13  fails at x = 4294770701  (0.99995423 x B^2)
d = 15  fails at x = 4294901775  (0.99998474 x B^2)
d = 17  fails at x = 4294901777  (0.99998475 x B^2)
...

20-bit limbs:

d = 3  fails at x = 1099510579203  (0.99999905 x B^2)
d = 5  fails at x = 1099510579205  (0.99999905 x B^2)
d = 7  fails at x = 1099507433479  (0.99999619 x B^2)
...


Question: can one work out some effective, explicit bound for epsilon as a function of bits and D? For example, can we give an explicit bound for epsilon when bits = 64 and D = 2^8 or D = 2^16?

Neither conjecture needs a proof to be used in a primality test, since we ultimately fall back on other methods as noted above. However, proofs would allow using this slightly faster divisibility test in other settings where exactness does matter.

Fredrik

David Sparks

unread,
Apr 7, 2026, 3:24:04 PM (2 days ago) Apr 7
to Fredrik Johansson, Edgar Costa, flint...@googlegroups.com
On Monday, April 6th, 2026 at 23:54, Fredrik Johansson <fredrik....@gmail.com> wrote:

> Conjecture 1: this sloppy version of the 2x1 limb divisibility test is actually exact for x < 2^(2*bits-1) (e.g. for all 127-bit integers when we have 64-bit limbs).

Really excellent thinking!

This part is easy. If x is a multiple of an odd divisor d,
multiplication by the inverse actually computes the quotient x/d.

So if x is bounded, x/d will be bounded, too.

Let B = floor((4^bits - 1) / d) be the maximum possible
quotient, produced when x = d*B.

Let B_lo be the low k bits of B. As long as x/d < B - B_lo,
testing only the high (bits-k) bits will suffice.

In other words, x < d*B - d*B_lo. We know that B_lo <= 2^k - 1, and
d*B >= (2^bits - d), so
x < 4^bits - d - d*(2^k - 1)
= 4^bits - d*2^k
is a safe bound.

(I used an arbitrary k so we can consider things like testing only
the upper 32 bits.)

> bits = 12: smallest failing x: 8394753 (0.500366 B^2) (for d = 2049)
> bits = 13: smallest failing x: 33566721 (0.500183 B^2) (for d = 4097)
> bits = 14: smallest failing x: 134242305 (0.500092 B^2) (for d = 8193)

These basically match the bound I computed. When d*2^k reaches
half of 2^(2k-1), i.e. d >= d^k-1, the first failure comes
at (almost) 2^(2k-1) = B/2.

> Conjecture 2: for bounded d <= D, the test is exact for all x < (1 - epsilon) * 2^(2*bits), where epsilon converges (rapidly) to zero as a function of the bit size (of order 2^(-bits)?).

The same formula applies. The test is exact as long as
x < 4^bits - d*2^k
<= 4^bits - D*2^k
= 4^bits * (1 - D*2^(k-2*bits))
= 4^bits * (1 - D/2^(2*bits-k))

This is using my slight overestimate; you could push slightly
for specific values of D based on the actual values of B_lo, but
you don't gain a lot. For a single divisor d, you can on average
cut epsilon in half, but since you're considering all (odd prime)
d < D, the bound is the minimum of the individual bounds.

> Neither conjecture needs a proof to be used in a primality test, since we ultimately fall back on other methods as noted above. However, proofs would allow using this slightly faster divisibility test in other settings where exactness does matter.

Hopefully the above helps.

Fredrik Johansson

unread,
Apr 8, 2026, 3:49:35 AM (yesterday) Apr 8
to flint-devel
That works nicely, thanks. Now implemented in https://github.com/flintlib/flint/pull/2629

Next step will be to use this divisibility test for factoring too.

Fredrik

David Sparks

unread,
Apr 8, 2026, 7:32:44 AM (yesterday) Apr 8
to flint...@googlegroups.com
Another way to write the full-range test is:

if (unlikely((a1 <= abound1) | (b1 <= bbound1)) &&
    (n_ll_le(...) | n_ll_le(...)))

For a single test, it can be written very nicely:
if (unlikely(a1 <= abound1) && (a1 < abound1 || a0 <= abound0))
... but that elegance disappears when we doing two tests in
parallel.

> + #if FLINT_BITS == 64 && defined(__GNUC__)

Also, rather than __GNUC__, using __SIZEOF_INT128__ tests if this
particular platform and compiler version has __int128 support.

> + typedef struct {
> + const char *s;
> + int expect;
> + } testcase;

It totally doesn't matter for such a small number of tests, but
one trick I've used in such test code is to place the expected
result in the first character of the string. E.g.

/* The first character is the expected result, -1, 0, or +1 */
static const char * testcases[] = {
"\377" "3404730287403079539471001", /* oversize */
"\377" "3317044064679887385961981", /* oversize */
"\0" "3110269097300703345712981",
...
};

expect = (signed char)testcases[i][0];
fmpz_set_str(f, testcases[i]+1, 10);

This avoids a lot of unnecessary alignment padding which bloats
"expect" to 64 bits when it only needs 2. The space is unimportant
for test code, but using string-pasting as demonstrated above can
make it quite legible.

Of course, if you really cared about space, you could skip the array
of pointers as well and just run all the test cases together:

/* \0-separated test cases. The first character of each is
the expected result (-1, 0 or +1) plus two.
*/
static const char testcases[] =
"\1" "3404730287403079539471001" /* oversize */
"\0\1" "3317044064679887385961981" /* oversize */
"\0\2" "3110269097300703345712981"
...
"\0"; /* Implicit trailing \0 (two in a row) ends the list */

for (const char *t = testcases; *t; t += strlen(t) + 1) {
int expected = *t++ - 2;
fmpz_set_str(f, t, 10);
...
}

Reply all
Reply to author
Forward
0 new messages