Michael Clark
Mar 25, 2016, 4:42:41 PM (3/25/16)
Sign in to forward
You do not have permission to delete messages in this group
Sign in to report message
Either email addresses are anonymous for this group or you need the view member email addresses permission to view the original message
to RISC-V Software Developers
Hi All,
I have found an interesting case where an optimisation is not being applied by GCC on RISC-V, and I have also noticed some strange assembly output from GCC on RISC-V.
Both GCC and Clang appear to optimise division by a constant Mersenne prime on x86_64; however, GCC on RISC-V is not applying this optimisation.
See test program and assembly output for these platforms:
* GCC -O3 on RISC-V
* GCC -O3 on x86_64
* LLVM/Clang -O3 on x86_64
Another strange observation is that GCC on RISC-V is moving a1 to a5 via a stack store followed by a stack load. Odd? GCC 5 also seems to be doing odd stuff with stack 'moves' on x86_64, moving esi to ecx via the stack (I think recent x86 micro-architectures treat the top of the stack like an extended register file, so this may only have a small penalty on x86).
See GCC on RISC-V is emitting this:
# test_0 as emitted by GCC on RISC-V: the divisor takes a round trip through the stack.
test_0:
add sp,sp,-16       # reserve 16 bytes of stack
sw a1,12(sp)        # store a1 into the new stack slot
lw a5,12(sp)        # reload the same slot into a5
add sp,sp,16        # release the stack space again
remuw a0,a0,a5      # a0 = a0 % a5 (unsigned 32-bit remainder)
jr ra               # return to caller
instead of this:
# test_0 as the author expected it: remainder taken directly on the incoming registers.
test_0:
remuw a0,a0,a1      # a0 = a0 % a1 (unsigned 32-bit remainder), no stack traffic
jr ra               # return to caller
Compiler devs, please read the test program and assembly output below. I have not yet tested LLVM/Clang on RISC-V… I will do that next… I have not had time to dig into the compiler code yet...
Regards,
Michael.
/* Test program */
#include <stdio.h>
#include <limits.h>
/* Modulus under test: 8191 = 2^13 - 1 (the Mersenne prime the post refers to). */
static const int p = 8191;
/* Shift count used by test_2: 2^13 = 8192 = p + 1. */
static const int s = 13;
/*
 * Control case: remainder by a divisor that is only known at run time.
 *
 * The parameter p shadows the file-scope constant p, so the divisor here is
 * whatever the caller passes. k is unsigned int and p is int, so the usual
 * arithmetic conversions make this an unsigned remainder; the result is
 * converted to int on return. noinline keeps the function out of its callers
 * so it appears as its own symbol in the assembly output.
 *
 * NOTE(review): p is declared volatile, which obliges the compiler to treat
 * it as an object in memory and perform a real read of it. That is very
 * likely why the listings store the incoming argument to the stack and load
 * it back before dividing -- confirm by compiling without volatile.
 */
int __attribute__ ((noinline)) test_0(unsigned int k, volatile int p)
{
return k % p;
}
/*
 * Case under test: remainder by the file-scope constant p (8191).
 *
 * Because the divisor is a compile-time constant, the compiler is free to
 * replace the division with cheaper arithmetic. k is unsigned int, so the
 * remainder is unsigned and lies in [0, 8190]; it is returned as int.
 * noinline keeps the function as a separate symbol in the assembly output.
 */
int __attribute__ ((noinline)) test_1(unsigned int k)
{
return k % p;
}
/*
 * Hand-written k mod 8191 using only masks, shifts, adds and one compare.
 *
 * Relies on 2^13 = 8192 being congruent to 1 modulo 8191: splitting a value
 * into its low 13 bits and the remaining high bits and adding the two parts
 * leaves the value unchanged modulo 8191 while making it smaller.
 * main() compares the result against test_1 for equality.
 */
int __attribute__ ((noinline)) test_2(unsigned int k)
{
/* First fold: low 13 bits plus the upper 19 bits; at most 8191 + 524287. */
int i = (k&p) + (k>>s);
/* Second fold: i is non-negative here, so the signed shift is well defined; result is at most 8191 + 65. */
i = (i&p) + (i>>s);
/* i is now below 2*8191, so a single conditional subtraction lands in [0, 8190]. */
if (i>=p) i -= p;
return i;
}
/*
 * Exhaustively cross-check test_1 (compiler-generated k % p) against test_2
 * (hand-written shift/mask reduction) and print every input on which they
 * disagree, one line per mismatch: "<input> <test_1 result> <test_2 result>".
 *
 * The sweep visits every int value from INT_MIN to INT_MAX inclusive; each
 * value is converted to unsigned int by the calls, so all 2^32 bit patterns
 * are covered. The loop leaves via break after checking INT_MAX: a loop
 * condition of `i < INT_MAX` would skip INT_MAX itself, and `i <= INT_MAX`
 * would need i++ to overflow, which is undefined behaviour for signed int.
 *
 * test_0 is called once only so that the control function is referenced.
 */
int main()
{
    test_0(1, 8191); /* control */
    for (int i = INT_MIN; ; i++) {
        int r1 = test_1(i), r2 = test_2(i);
        if (r1 != r2) printf("%d %d %d\n", i, r1, r2);
        if (i == INT_MAX) break; /* last value checked; stop before i++ would overflow */
    }
    return 0;
}
/* RISC-V GCC */
$ riscv64-unknown-elf-gcc --version
riscv64-unknown-elf-gcc (GCC) 5.2.0
# RISC-V GCC 5.2.0 output for test_0 (remainder by a run-time divisor).
# Presumably a0 = k and a1 = p on entry, with the result returned in a0 -- confirm against the RISC-V calling convention.
test_0:
add sp,sp,-16       # reserve 16 bytes of stack
sw a1,12(sp)        # store a1 into the new stack slot
lw a5,12(sp)        # reload the same slot into a5
add sp,sp,16        # release the stack space again
remuw a0,a0,a5      # a0 = a0 % a5 (unsigned 32-bit remainder)
jr ra               # return to caller
# RISC-V GCC 5.2.0 output for test_1 (remainder by the constant 8191).
# The constant divisor is materialised in a register and a hardware remainder
# instruction is still used; no multiply/shift replacement is emitted.
test_1:
li a5,8192          # a5 = 8192
addw a5,a5,-1       # a5 = 8191
remuw a0,a0,a5      # a0 = a0 % 8191 (unsigned 32-bit remainder)
ret                 # return to caller
# RISC-V GCC 5.2.0 output for test_2 (hand-written fold-and-subtract reduction).
# Register comments use the C names: k is the argument, i the running value.
test_2:
li a3,8192          # a3 = 8192
addw a2,a3,-1       # a2 = 8191 (mask and modulus)
and a4,a0,a2        # a4 = k & 8191
srlw a0,a0,13       # a0 = k >> 13 (logical shift, k is unsigned)
addw a5,a4,a0       # a5 = i after the first fold
and a0,a5,a2        # a0 = i & 8191
sraw a5,a5,13       # a5 = i >> 13 (arithmetic shift, i is a signed int)
addw a0,a0,a5       # a0 = i after the second fold
addw a3,a3,-2       # a3 = 8190
ble a0,a3,.L5       # if i <= 8190 the result is already reduced; skip the subtraction
subw a0,a0,a2       # otherwise i -= 8191
.L5:
ret                 # return i in a0
/* Linux x86_64 GCC */
$ gcc --version
gcc (Debian 5.2.1-23) 5.2.1 20151028
# x86-64 GCC 5.2.1 output for test_0 (Intel syntax; System V: edi = k, esi = p, result in eax).
# NOTE(review): cdq/idiv is a *signed* division, whereas k is unsigned int in the
# C source and the other listings use unsigned operations (remuw, div). This
# listing may have been produced from a variant of the source -- confirm.
test_0:
mov DWORD PTR [rsp-4], esi      # store p in the slot just below rsp (rsp itself is not adjusted)
mov ecx, DWORD PTR [rsp-4]      # reload p from that slot into ecx
mov eax, edi                    # eax = k (dividend, low half)
cdq                             # sign-extend eax into edx:eax
idiv ecx                        # signed divide edx:eax by ecx; remainder lands in edx
mov eax, edx                    # return the remainder
ret
# x86-64 GCC 5.2.1 output for test_1 (Intel syntax; edi = k, result in eax).
# No divide instruction: the quotient q is derived from a multiplication by
# 524353 (= 2^19 + 2^6 + 1) built out of shifts and adds, and the remainder
# is then formed as k - q*8191.
test_1:
mov eax, edi        # rax = k, zero-extended to 64 bits
mov rcx, rax
mov rdx, rax
sal rcx, 6          # rcx = k << 6
sal rdx, 19         # rdx = k << 19
add rdx, rcx        # rdx = k * (2^19 + 2^6)
add rax, rdx        # rax = k * 524353
mov edx, edi        # edx = k
shr rax, 32         # eax = t, the high 32 bits of the product
sub edx, eax        # edx = k - t
shr edx             # edx = (k - t) >> 1
add eax, edx        # eax = t + ((k - t) >> 1)
shr eax, 12         # eax = q
mov edx, eax
sal edx, 13         # edx = q << 13
sub edx, eax        # edx = q * 8191
sub edi, edx        # edi = k - q * 8191
mov eax, edi        # return the remainder
ret
# x86-64 GCC 5.2.1 output for test_2 (Intel syntax; edi = k, result in eax).
# Branch-free: the final conditional subtraction is done with cmov.
test_2:
mov eax, edi
shr edi, 13             # edi = k >> 13 (logical shift, k is unsigned)
and eax, 8191           # eax = k & 8191
add eax, edi            # eax = i after the first fold
mov edx, eax
sar eax, 13             # eax = i >> 13 (arithmetic shift, i is a signed int)
and edx, 8191           # edx = i & 8191
add eax, edx            # eax = i after the second fold
lea edx, [rax-8191]     # edx = i - 8191, computed without touching flags
cmp eax, 8191
cmovge eax, edx         # if i >= 8191 (signed) take i - 8191, else keep i
ret
/* Darwin x86_64 LLVM Clang */
$ cc --version
Apple LLVM version 7.3.0 (clang-703.0.29)
# Apple LLVM 7.3.0 output for test_0 (Intel syntax; System V: edi = k, esi = p, result in eax).
_test_0:
mov dword ptr [rsp - 4], esi    # store p in the slot just below rsp (rsp itself is not adjusted)
xor edx, edx                    # zero the high half of the dividend edx:eax
mov eax, edi                    # eax = k
div dword ptr [rsp - 4]         # unsigned divide by p read back from the stack; remainder lands in edx
mov eax, edx                    # return the remainder
ret
# Apple LLVM 7.3.0 output for test_1 (Intel syntax; edi = k, result in eax).
# No divide instruction: the quotient q is derived from a multiplication by
# 524353, and the remainder is then formed as k - q*8191.
_test_1:
mov eax, edi            # rax = k, zero-extended to 64 bits
imul rax, rax, 524353   # rax = k * 524353
shr rax, 32             # eax = t, the high 32 bits of the product
mov ecx, edi            # ecx = k
sub ecx, eax            # ecx = k - t
shr ecx                 # ecx = (k - t) >> 1
add ecx, eax            # ecx = t + ((k - t) >> 1)
shr ecx, 12             # ecx = q
imul eax, ecx, 8191     # eax = q * 8191
sub edi, eax            # edi = k - q * 8191
mov eax, edi            # return the remainder
ret
# Apple LLVM 7.3.0 output for test_2 (Intel syntax; edi = k, result in eax).
# Branch-free: the final conditional subtraction is done with cmov.
_test_2:
mov eax, edi
and eax, 8191                   # eax = k & 8191
mov ecx, edi
shr ecx, 13                     # ecx = k >> 13
add eax, ecx                    # eax = i after the first fold
add ecx, edi                    # ecx = (k >> 13) + k
and ecx, 8191                   # ecx = i & 8191 (the low 13 bits of this sum match those of i)
shr eax, 13                     # eax = i >> 13
lea edx, [rcx + rax]            # edx = i after the second fold
cmp edx, 8190
lea eax, [rcx + rax - 8191]     # eax = i - 8191, computed without touching flags
cmovbe eax, edx                 # if i <= 8190 (unsigned) keep i, else keep i - 8191
ret