uint32_t rotl32c (uint32_t x, uint32_t n)
{
assert
(n<32);
return
(x<<n) | (x>>(-n&31));
}
and variants like this are very efficient in C, especially for fixed values of n:
#include <stdint.h>
uint64_t rol1(uint64_t x)
{
return (x << 1) | (x >> 63);
}
uint64_t ror1(uint64_t x)
{
return (x >> 1) | (x << 63);
}
uint64_t ror3(uint64_t x)
{
return (x >> 3) | (x << 61);
}
rol1(unsigned long):
mov rax, rdi
rol rax
ret
ror1(unsigned long):
mov rax, rdi
ror rax
ret
ror3(unsigned long):
mov rax, rdi
ror rax, 3
ret
but if I try to do it in numba, I get this:
@numba.njit(numba.u8(numba.u8))
def rol1(x):
return (x << 1) | (x >> 63)
print rol1.inspect_asm().values()[0]
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 11
.globl ___main__.rol1$1.uint64
.align 4, 0x90
___main__.rol1$1.uint64:
leaq (%rcx,%rcx), %rax
sarq $63, %rcx
orq %rcx, %rax
movq %rax, (%rdi)
xorl %eax, %eax
retq
I suppose with a very short function like that it doesn't matter due to the interfacing between Python and numba-compiled code, but if the rotate code is within a hot loop in larger section of numba-compiled code, then it would helpful to understand how to write it efficiently.