int gcd(int a, int b)
{
int r;
if(a == 0)
return b;
while(b != 0)
{
r = a % b;
a = b;
b = r;
}
return a;
}
which calculates the "greatest common divisor" of a and b
e.g. gcd(6,8) = 2, gcd(147896325,123698745) = 45
now, there is a "binary" version of that method too
it works like this:
1. extract the greatest common power of 2 of a and b
then loop through:
2. a even, b uneven => a = a / 2
3. a uneven, b even => b = b / 2
4. a uneven, b uneven => subtract the smaller number from the larger one
until either a = 0 or b = 0
take the one that is not 0, multiply by the power of 2 from 1. and return that value as being gcd(a,b)
finally, I implemented this in asm (obviously :) and...
it looks like there may be some room for improvement... as usual
it already runs way faster (sort of) than the above C version which requires divison
still, someone in here may yet see what I didn't
all comments welcome
Yours
P
; extern int gcdAsm(int a, int b)
;
; computes gcd(a,b) according to:
; 1. a even, b even: gcd(a,b) = 2 * gcd(a/2,b/2)
; 2. a even, b uneven: gcd(a,b) = gcd(a/2,b)
; 3. a uneven, b even: gcd(a,b) = gcd(a,b/2)
; 4. a uneven, b uneven: a>b ? a -= b : b -= a, i.e. gcd(a,b) = gcd(min(a,b),max(a,b) - min(a,b))
; do 1., repeat 2. - 4. until a = 0 or b = 0
BITS 32
GLOBAL _gcdAsm
SECTION .text
_gcdAsm:
push ebp
mov ebp,esp
push ebx
push ecx
push edx
push edi
push esi
mov eax,[ebp + 8] ; eax = a (0 <= a <= 2^31 - 1 (= 2 147 483 647))
mov ebx,[ebp + 12] ; ebx = b (0 <= b <= 2^31 - 1)
xor edi,edi ; "final" shift adjustment (n.b.: gcd(a,b) = 2 * gcd(a/2,b/2))
test eax,eax ; if a = 0 we're done
je done
test ebx,ebx ; if b = 0, we're done
jne bothEven
done:
add eax,ebx ; either a = 0 or b = 0, return the other one
pop esi ; restore all and return
pop edi
pop edx
pop ecx
pop ebx
mov esp,ebp
pop ebp
ret ; eax = gcd(a,b)
bothEven:
test eax,1 ; a even ?
jnz main ; no
test ebx,1 ; b even ?
jnz main ; no
shr eax,1 ; both even: a = a / 2, b = b / 2
add edi,1 ; adjust shift count
shr ebx,1
cmp eax,0 ; if a = 0 we're done
je final
cmp ebx,0 ; if b = 0 we're done
jne bothEven
add eax,ebx ; not really needed since ebx = 0... :)
mov ecx,edi ; btw, how often does it happen that both a and b are powers of 2 ?
shl eax,cl ; adjust with shift count
pop esi ; restore all and return
pop edi
pop edx
pop ecx
pop ebx
mov esp,ebp
pop ebp
ret ; eax = gcd(a,b)
main:
; once here, we're sure that a and b won't ever be even both at the same time (!)
cmp eax,1 ; a = 0 ?
jc final ; yes, we're done
cmp ebx,1 ; b = 0 ?
jc final ; yes, we're done
mainLoop:
test eax,1 ; while a even ...
jnz aUneven
shr eax,1 ; ... a = a / 2
jnz mainLoop
add eax,ebx ; a = 0, we're done
mov ecx,edi ; shift count as computed in "bothEven"-loop
shl eax,cl ; adjust by shift count
pop esi ; restore all and return
pop edi
pop edx
pop ecx
pop ebx
mov esp,ebp
pop ebp
ret ; eax = gcd(a,b)
aUneven:
test ebx,1 ; while b even ...
jnz bUneven
shr ebx,1 ; ... b = b / 2
jnz aUneven ; eax hasn't changed, no need to jump back to mainLoop
add eax,ebx ; b = 0, we're done
mov ecx,edi ; shift count as computed in "bothEven"-loop
shl eax,cl ; adjust by shift count
pop esi ; restore all and return
pop edi
pop edx
pop ecx
pop ebx
mov esp,ebp
pop ebp
ret ; eax = gcd(a,b)
bUneven:
mov edx,ebx ; both a and b uneven
sub edx,eax ; compute:
sbb ecx,ecx ; a-new = min(a,b)
and ecx,edx ; b-new = max(a,b) - min(a,b)
add eax,ecx ; i.e. a branchless version of:
sub ebx,ecx ; if a > b then a = a - b else b = b - a
sub ebx,eax ; doesn't work with negative numbers, but that's excluded by definition :)
jnz mainLoop
final:
add eax,ebx ; either a = 0 or b = 0, so we're done
mov ecx,edi ; shift count as computed in "bothEven"-loop
shl eax,cl ; adjust by shift count
pop esi ; restore all and return
pop edi
pop edx
pop ecx
pop ebx
mov esp,ebp
pop ebp
ret ; eax = gcd(a,b)
Looking back at your algorithm, it is obviously the same as the C
version, except using repeated subtraction instead of div/mod
operations.
A tiny speedup could be had by keeping the shift count in ECX instead of
EDI, avoiding a copy before the SHL operation, but this is only zero or
one cycle.
Step 1 could be speeded up by OR'ing together the two values, and then
extracting the number of trailing zeros, using a BSF operation (which is
finally fast on a P6):
mov edx,eax
or edx,ebx
bsf ecx,edx
shr eax,cl
shr ebx,cl
This gets rid of a bunch of branch operations.
The remaining three steps could be handled like this:
if even(A),odd(B), save a new value for A = A >> 1
if odd(A), even(B), save a new value for B = B >> 1
if odd(A), odd(B), save new values for both A & B
mov ebp,eax ; Save a copy of A & B
mov edi,ebx
; Using your branchless code for even A & B:
mov edx,ebx
sub edx,eax
sbb esi,esi
and esi,edx
add eax,esi
jz A_is_zero
sub ebx,esi
sub ebx,eax
jz B_is_zero
Placing the exit tests here save having to do a pair of explicit tests
at the bottom, now we can rotate the loop to make this the bottom of the
loop instead, avoiding all branches except the loop exit!
mov [SaveA_odd_odd],eax
mov [SaveB_odd_odd],ebx
; Retrive old values of A & B
mov eax,ebp
mov ebx,esi
shr eax,1
sbb ebp,ebp
shr ebx,1
mov [SaveA_even_odd],eax
adc ebp,ebp
mov [SaveB_odd_even],ebx
This code above is intentially unoptimized to make it clearer.
It should be pipelined, while avoiding the extra reg-reg moves.
The idea is to calculate all the possible new values in parallel, saving
them to carefully aligned memory slots, and then use a pair of table
lookups based on the two parity bits shifted out of A & B:
; EBP can now have 4 possible values:
; even(A) + even(B) -> 0 ;; This cannot happen, since we did the BSF
first!
; even(A) + odd(B) -> 1
; odd(A) + even(B) ->-2
; odd(A) + odd(B) ->-1
mov eax,A_Table_Start[ebp*4 + 8]
mov ebx,B_Table_Start[ebp*4 + 8]
This makes for an inner loop with 21 instructions, but no branches
except for the loop exit, so it should run in 8-11 cycles. This is much
faster than a DIV, but it will still be slower if the inputs consists of
one very large and another very small number!
Terje
PS. I don't remember exactly what BSF returns, and the case where both
inputs are zero must be specialcased.
--
- <Terje.M...@hda.hydro.com>
Using self-discipline, see http://www.eiffel.com/discipline
"almost all programming can be viewed as an exercise in caching"
A few criticisms of your code:
1. Your code uses additions where an 'inc' operation would work just as
well.
2. Comparisons with zero should be implemented as cmp [reg],0 or - better -
test [reg],[reg].
3. The code would look much cleaner if you had a single exit point from the
function.
4. Your code jumps about too much, and the jumps cannot always be predicted.
Try to rearrange the code so that the number of jumps is smaller.
--
Daniel Pfeffer
--------------
Remove 'nospam' from my address in order to contact me directly
PatD <Pa...@govnet.com> wrote in message
news:81kp5q$7id$1...@autumn.news.rcn.net...
>
> you've probably already seen something like:
>
> int gcd(int a, int b)
> {
> int r;
>
> if(a == 0)
> return b;
> while(b != 0)
> {
> r = a % b;
> a = b;
> b = r;
> }
>
> return a;
> }
>
> which calculates the "greatest common divisor" of a and b
> e.g. gcd(6,8) = 2, gcd(147896325,123698745) = 45
>
> now, there is a "binary" version of that method too
>
> it works like this:
> 1. extract the greatest common power of 2 of a and b
> then loop through:
> 2. a even, b uneven => a = a / 2
> 3. a uneven, b even => b = b / 2
> 4. a uneven, b uneven => subtract the smaller number from the larger one
> until either a = 0 or b = 0
> take the one that is not 0, multiply by the power of 2 from 1. and return
that value as being gcd(a,b)
>
>
> finally, I implemented this in asm (obviously :) and...
> it looks like there may be some room for improvement... as usual
>
[snip]
>
> ; extern int gcdAsm(int a, int b)
> ;
> ; computes gcd(a,b) according to:
> ; 1. a even, b even: gcd(a,b) = 2 * gcd(a/2,b/2)
> ; 2. a even, b uneven: gcd(a,b) = gcd(a/2,b)
> ; 3. a uneven, b even: gcd(a,b) = gcd(a,b/2)
> ; 4. a uneven, b uneven: a>b ? a -= b : b -= a, i.e. gcd(a,b) =
gcd(min(a,b),max(a,b) - min(a,b))
> ; do 1., repeat 2. - 4. until a = 0 or b = 0
>
[snip]
Cor, are all those PUSH TEST SHL JMP SUB ADD POP faster than this ?
I know divide can be done by shift and subtract, but
it looks like a lot of overhead to reduce the number of subtracts
performed. Are the jumps so bad in the odd,odd case?
int gcdasm(int a, int b)
{
__asm {
mov eax,dword ptr [esp+4] ;//a cheaper than EBP relative
mov ecx,dword ptr [esp+8] ;//b
L0:
cmp eax,ecx
je L2
jg L1
xchg eax,ecx
L1:
sub eax,ecx
jmp L0
L2:
}
}
Can't you keep the shift count in CL (ECX) rather than EDI ?
kind regards,
David Earlam
"In the country of the blind the one eyed man is king" (Desiderius
Erasmus c. 1465-1536)
"Beauty problems are redefined, the doorbells do not ring" (John
Cooper-Clarke 1959-)
David....@arm.com (speaking for myself)
Sent via Deja.com http://www.deja.com/
Before you buy.
some people have suggested (or asked why I didn't use)
subtraction-only methods...
why didn't I use it ?
easy, because it's bad !
a nice implementation of that idea can be seen at Paul Hsieh's website:
http://www.azillionmonkeys.com/qed/asmexample.html
it goes like this:
gcd: add edx,eax
jne L1
mov eax,1
jmp L2
L1: mov ebx,edx ; 1 dep: edx
add eax,eax ; 1
sub ebx,eax ; 2 dep: eax
shr eax,1 ; 2
mov ecx,ebx ; 3 dep: ebx
sar ebx,1fH ; 3
and ebx,ecx ; 4 dep: ecx
add eax,ebx ; 5 dep: ebx
sub edx,eax ; 6 dep: eax
test eax,eax ; 6
jne L1 ; 6
mov eax,edx
L2:
(see the page for details)
now, remember the "original" C version ?
int gcd(int a, int b)
{
int r;
if(a == 0)
return b;
while(b != 0)
{
r = a % b;
a = b;
b = r;
}
return a;
}
what's the advantage of that one over a subtraction-only method ?
there's basically only one: speed !
imagine gcd(1,1000000) ! how many iterations are needed for that one ?
that's the point
the C version may use slow operations, but it only takes a couple of runs
to come up with the result
btw, that algorithm is some 2000 years old ! and still perfectly valid :)
now, on to the real stuff:
while I was toying around with a branchless array-version, I finally
decided to print out all intermediate values
and, after some long look into that...
(boring I might add... pages of pages of numbers only :)
noticed that a couple of steps
can simply be ...stepped over !
first of all, "simplifying" by 2 should be done as much as possible
the smaller the numbers, the faster we get a result
second: there's no need to work with max(a,b) !
it's enough to use max(a,b) - min(a,b) only
even better, no storage space for temporary values is needed
one can simply calculate the next values immediately ! :)
in short, all we have to do is:
get min(a,b)
get max(a,b)
calculate max(a,b) - min(a,b)
simplify those 2 values by 2 as much as possible
loop until max(a,b) - min(a,b) = 0
that's it already
back to our gcd(1,1000000) example
how many iterations do we need ?
1 1 000 000
1 15 625
1 1 953
1 61
1 15
1 7
1 3
1 1
1 0
done ! amazing, isn't it ? :)
btw, worst case input for that alg. is actually 2 consecutive Fibonacci numbers
(Fn = Fn-1 + Fn-2, F0=1, F1=1)
e.g. 89 and 55
the original c version does:
(89,55) -> (55,34) -> (34,21) -> (21,13) -> (13,8) -> (8,5) -> (5,3) -> (3,2) -> (2,1) -> (1,0)
"my" version:
(89,55) -> (55,17) -> (17,19) -> (17,1) -> (1,1) -> (1,0)
:)
(subtraction-only is left as an exercice for the reader)
finally, combining all of the above, some (lots) of ideas from replies
and a bit of thinking of my own, lead to this (final ?) asm version,
(I know, you've all been waiting for this, but I felt like boring you with details first :)
; int gcdAsm(int a, int b)
;
; computes gcd(a,b) according to:
; 1. a even, b even: gcd(a,b) = 2 * gcd(a/2,b/2), and remember how often this happened
; 2. a even, b uneven: gcd(a,b) = gcd(a/2,b)
; 3. a uneven, b even: gcd(a,b) = gcd(a,b/2)
; 4. a uneven, b uneven: a>b ? a -= b : b -= a, i.e. gcd(a,b) = gcd(min(a,b),max(a,b) - min(a,b))
; do 1., repeat 2. - 4. until a = 0 or b = 0
; return (a + b) corrected by the remembered value from 1.
BITS 32
GLOBAL _gcdAsm
SECTION .text
_gcdAsm:
push ebp
mov ebp,esp
push ebx
push ecx
push edx
push edi
mov eax,[ebp + 8] ; eax = a (0 <= a <= 2^31 - 1 (= 2 147 483 647))
mov ebx,[ebp + 12] ; ebx = b (0 <= b <= 2^31 - 1)
; by definition: gcd(a,0) = a, gcd(0,b) = b, gcd(0,0) = 1 !
mov ecx,eax
or ecx,ebx
bsf ecx,ecx ; extract the greatest common power of 2 of a and b...
jnz notBoth0
mov eax,1 ; if a = 0 and b = 0, return 1
jmp done
notBoth0:
mov edi,ecx ; ...and remember it for later use
test eax,eax
jnz aNot0
mov eax,ebx ; if a = 0, return b
jmp done
aNot0:
test ebx,ebx
jz done ; if b = 0, return a
bsf ecx,eax ; "simplify" a as much as possible
shr eax,cl
bsf ecx,ebx ; "simplify" b as much as possible
shr ebx,cl
mainLoop:
mov ecx,ebx
sub ecx,eax ; b - a
sbb edx,edx ; edx = 0 if b >= a or -1 if a > b
and ecx,edx ; ecx = 0 if b >= a or b - a if a > b
add eax,ecx ; a-new = min(a,b)
sub ebx,ecx ; b-new = max(a,b)
sub ebx,eax ; we're now sure that that difference is >= 0
bsf ecx,eax ; "simplify" as much as possible by 2
shr eax,cl
bsf ecx,ebx ; "simplify" as much as possible by 2
shr ebx,cl
jnz mainLoop ; keep looping until ebx = 0
mov ecx,edi ; multiply back with the common power of 2
shl eax,cl
done:
pop edi
pop edx
pop ecx
pop ebx
mov esp,ebp
pop ebp
ret ; eax = gcd(a,b)
this should even satisfy those that asked for "cleaner" code with less exit-points :)
("bsf" is really only an option on ppro, p2 or better
but, since this is supposed to run on p2... :)
all comments, as usual, welcome
Yours
P
" A child of five could understand this, fetch me a child of five "
Groucho Marx
since eax actually only ever holds odd values anyway
the bsf/shr combo is pretty much pointless :)
the "inner" loop should look like this:
mainLoop:
mov ecx,ebx
sub ecx,eax ; b - a
sbb edx,edx ; edx = 0 if b >= a or -1 if a > b
and ecx,edx ; ecx = 0 if b >= a or b - a if a > b
add eax,ecx ; a-new = min(a,b)
sub ebx,ecx ; b-new = max(a,b)
sub ebx,eax ; we're now sure that that difference is >= 0
bsf ecx,ebx ; "simplify" as much as possible by 2
shr ebx,cl
jnz mainLoop ; keep looping until ebx = 0
all comments, as usual, welcome