71 views

Skip to first unread message

Nov 26, 1999, 3:00:00 AM11/26/99

to

you've probably already seen something like:

int gcd(int a, int b)

{

int r;

if(a == 0)

return b;

while(b != 0)

{

r = a % b;

a = b;

b = r;

}

return a;

}

which calculates the "greatest common divisor" of a and b

e.g. gcd(6,8) = 2, gcd(147896325,123698745) = 45

now, there is a "binary" version of that method too

it works like this:

1. extract the greatest common power of 2 of a and b

then loop through:

2. a even, b uneven => a = a / 2

3. a uneven, b even => b = b / 2

4. a uneven, b uneven => subtract the smaller number from the larger one

until either a = 0 or b = 0

take the one that is not 0, multiply by the power of 2 from 1. and return that value as being gcd(a,b)

finally, I implemented this in asm (obviously :) and...

it looks like there may be some room for improvement... as usual

it already runs way faster (sort of) than the above C version which requires divison

still, someone in here may yet see what I didn't

all comments welcome

Yours

P

; extern int gcdAsm(int a, int b)

;

; computes gcd(a,b) according to:

; 1. a even, b even: gcd(a,b) = 2 * gcd(a/2,b/2)

; 2. a even, b uneven: gcd(a,b) = gcd(a/2,b)

; 3. a uneven, b even: gcd(a,b) = gcd(a,b/2)

; 4. a uneven, b uneven: a>b ? a -= b : b -= a, i.e. gcd(a,b) = gcd(min(a,b),max(a,b) - min(a,b))

; do 1., repeat 2. - 4. until a = 0 or b = 0

BITS 32

GLOBAL _gcdAsm

SECTION .text

_gcdAsm:

push ebp

mov ebp,esp

push ebx

push ecx

push edx

push edi

push esi

mov eax,[ebp + 8] ; eax = a (0 <= a <= 2^31 - 1 (= 2 147 483 647))

mov ebx,[ebp + 12] ; ebx = b (0 <= b <= 2^31 - 1)

xor edi,edi ; "final" shift adjustment (n.b.: gcd(a,b) = 2 * gcd(a/2,b/2))

test eax,eax ; if a = 0 we're done

je done

test ebx,ebx ; if b = 0, we're done

jne bothEven

done:

add eax,ebx ; either a = 0 or b = 0, return the other one

pop esi ; restore all and return

pop edi

pop edx

pop ecx

pop ebx

mov esp,ebp

pop ebp

ret ; eax = gcd(a,b)

bothEven:

test eax,1 ; a even ?

jnz main ; no

test ebx,1 ; b even ?

jnz main ; no

shr eax,1 ; both even: a = a / 2, b = b / 2

add edi,1 ; adjust shift count

shr ebx,1

cmp eax,0 ; if a = 0 we're done

je final

cmp ebx,0 ; if b = 0 we're done

jne bothEven

add eax,ebx ; not really needed since ebx = 0... :)

mov ecx,edi ; btw, how often does it happen that both a and b are powers of 2 ?

shl eax,cl ; adjust with shift count

pop esi ; restore all and return

pop edi

pop edx

pop ecx

pop ebx

mov esp,ebp

pop ebp

ret ; eax = gcd(a,b)

main:

; once here, we're sure that a and b won't ever be even both at the same time (!)

cmp eax,1 ; a = 0 ?

jc final ; yes, we're done

cmp ebx,1 ; b = 0 ?

jc final ; yes, we're done

mainLoop:

test eax,1 ; while a even ...

jnz aUneven

shr eax,1 ; ... a = a / 2

jnz mainLoop

add eax,ebx ; a = 0, we're done

mov ecx,edi ; shift count as computed in "bothEven"-loop

shl eax,cl ; adjust by shift count

pop esi ; restore all and return

pop edi

pop edx

pop ecx

pop ebx

mov esp,ebp

pop ebp

ret ; eax = gcd(a,b)

aUneven:

test ebx,1 ; while b even ...

jnz bUneven

shr ebx,1 ; ... b = b / 2

jnz aUneven ; eax hasn't changed, no need to jump back to mainLoop

add eax,ebx ; b = 0, we're done

mov ecx,edi ; shift count as computed in "bothEven"-loop

shl eax,cl ; adjust by shift count

pop esi ; restore all and return

pop edi

pop edx

pop ecx

pop ebx

mov esp,ebp

pop ebp

ret ; eax = gcd(a,b)

bUneven:

mov edx,ebx ; both a and b uneven

sub edx,eax ; compute:

sbb ecx,ecx ; a-new = min(a,b)

and ecx,edx ; b-new = max(a,b) - min(a,b)

add eax,ecx ; i.e. a branchless version of:

sub ebx,ecx ; if a > b then a = a - b else b = b - a

sub ebx,eax ; doesn't work with negative numbers, but that's excluded by definition :)

jnz mainLoop

final:

add eax,ebx ; either a = 0 or b = 0, so we're done

mov ecx,edi ; shift count as computed in "bothEven"-loop

shl eax,cl ; adjust by shift count

pop esi ; restore all and return

pop edi

pop edx

pop ecx

pop ebx

mov esp,ebp

pop ebp

ret ; eax = gcd(a,b)

Nov 26, 1999, 3:00:00 AM11/26/99

to

PatD wrote:

>

> you've probably already seen something like:

>

> int gcd(int a, int b)

> {

> int r;

>

> if(a == 0)

> return b;

> while(b != 0)

> {

> r = a % b;

> a = b;

> b = r;

> }

>

> return a;

> }

>

> which calculates the "greatest common divisor" of a and b

> e.g. gcd(6,8) = 2, gcd(147896325,123698745) = 45

>

> now, there is a "binary" version of that method too

>

> it works like this:

> 1. extract the greatest common power of 2 of a and b

> then loop through:

> 2. a even, b uneven => a = a / 2

> 3. a uneven, b even => b = b / 2

> 4. a uneven, b uneven => subtract the smaller number from the larger one

> until either a = 0 or b = 0

> take the one that is not 0, multiply by the power of 2 from 1. and return that value as being gcd(a,b)

>

> finally, I implemented this in asm (obviously :) and...

> it looks like there may be some room for improvement... as usual

>

> you've probably already seen something like:

>

> int gcd(int a, int b)

> {

> int r;

>

> if(a == 0)

> return b;

> while(b != 0)

> {

> r = a % b;

> a = b;

> b = r;

> }

>

> return a;

> }

>

> which calculates the "greatest common divisor" of a and b

> e.g. gcd(6,8) = 2, gcd(147896325,123698745) = 45

>

> now, there is a "binary" version of that method too

>

> it works like this:

> 1. extract the greatest common power of 2 of a and b

> then loop through:

> 2. a even, b uneven => a = a / 2

> 3. a uneven, b even => b = b / 2

> 4. a uneven, b uneven => subtract the smaller number from the larger one

> until either a = 0 or b = 0

> take the one that is not 0, multiply by the power of 2 from 1. and return that value as being gcd(a,b)

>

> finally, I implemented this in asm (obviously :) and...

> it looks like there may be some room for improvement... as usual

Looking back at your algorithm, it is obviously the same as the C

version, except using repeated subtraction instead of div/mod

operations.

A tiny speedup could be had by keeping the shift count in ECX instead of

EDI, avoiding a copy before the SHL operation, but this is only zero or

one cycle.

Step 1 could be speeded up by OR'ing together the two values, and then

extracting the number of trailing zeros, using a BSF operation (which is

finally fast on a P6):

mov edx,eax

or edx,ebx

bsf ecx,edx

shr eax,cl

shr ebx,cl

This gets rid of a bunch of branch operations.

The remaining three steps could be handled like this:

if even(A),odd(B), save a new value for A = A >> 1

if odd(A), even(B), save a new value for B = B >> 1

if odd(A), odd(B), save new values for both A & B

mov ebp,eax ; Save a copy of A & B

mov edi,ebx

; Using your branchless code for even A & B:

mov edx,ebx

sub edx,eax

sbb esi,esi

and esi,edx

add eax,esi

jz A_is_zero

sub ebx,esi

sub ebx,eax

jz B_is_zero

Placing the exit tests here save having to do a pair of explicit tests

at the bottom, now we can rotate the loop to make this the bottom of the

loop instead, avoiding all branches except the loop exit!

mov [SaveA_odd_odd],eax

mov [SaveB_odd_odd],ebx

; Retrive old values of A & B

mov eax,ebp

mov ebx,esi

shr eax,1

sbb ebp,ebp

shr ebx,1

mov [SaveA_even_odd],eax

adc ebp,ebp

mov [SaveB_odd_even],ebx

This code above is intentially unoptimized to make it clearer.

It should be pipelined, while avoiding the extra reg-reg moves.

The idea is to calculate all the possible new values in parallel, saving

them to carefully aligned memory slots, and then use a pair of table

lookups based on the two parity bits shifted out of A & B:

; EBP can now have 4 possible values:

; even(A) + even(B) -> 0 ;; This cannot happen, since we did the BSF

first!

; even(A) + odd(B) -> 1

; odd(A) + even(B) ->-2

; odd(A) + odd(B) ->-1

mov eax,A_Table_Start[ebp*4 + 8]

mov ebx,B_Table_Start[ebp*4 + 8]

This makes for an inner loop with 21 instructions, but no branches

except for the loop exit, so it should run in 8-11 cycles. This is much

faster than a DIV, but it will still be slower if the inputs consists of

one very large and another very small number!

Terje

PS. I don't remember exactly what BSF returns, and the case where both

inputs are zero must be specialcased.

--

- <Terje.M...@hda.hydro.com>

Using self-discipline, see http://www.eiffel.com/discipline

"almost all programming can be viewed as an exercise in caching"

Nov 26, 1999, 3:00:00 AM11/26/99

to

Your method is interesting, but if we look at stages 2-4 we see that what

you are doing is a highly inefficient method of implementing the modulus

function. Your method may be useful when implementing gcd() for

infinite-precision numbers (division is a _very_ expensive operation in this

case) but when dealing with numbers that fit inside the CPU's registers -

division is the way to go for a general-purpose routine.

you are doing is a highly inefficient method of implementing the modulus

function. Your method may be useful when implementing gcd() for

infinite-precision numbers (division is a _very_ expensive operation in this

case) but when dealing with numbers that fit inside the CPU's registers -

division is the way to go for a general-purpose routine.

A few criticisms of your code:

1. Your code uses additions where an 'inc' operation would work just as

well.

2. Comparisons with zero should be implemented as cmp [reg],0 or - better -

test [reg],[reg].

3. The code would look much cleaner if you had a single exit point from the

function.

4. Your code jumps about too much, and the jumps cannot always be predicted.

Try to rearrange the code so that the number of jumps is smaller.

--

Daniel Pfeffer

--------------

Remove 'nospam' from my address in order to contact me directly

PatD <Pa...@govnet.com> wrote in message

news:81kp5q$7id$1...@autumn.news.rcn.net...

>

> you've probably already seen something like:

>

> int gcd(int a, int b)

> {

> int r;

>

> if(a == 0)

> return b;

> while(b != 0)

> {

> r = a % b;

> a = b;

> b = r;

> }

>

> return a;

> }

>

> which calculates the "greatest common divisor" of a and b

> e.g. gcd(6,8) = 2, gcd(147896325,123698745) = 45

>

> now, there is a "binary" version of that method too

>

> it works like this:

> 1. extract the greatest common power of 2 of a and b

> then loop through:

> 2. a even, b uneven => a = a / 2

> 3. a uneven, b even => b = b / 2

> 4. a uneven, b uneven => subtract the smaller number from the larger one

> until either a = 0 or b = 0

> take the one that is not 0, multiply by the power of 2 from 1. and return

that value as being gcd(a,b)

>

>

> finally, I implemented this in asm (obviously :) and...

> it looks like there may be some room for improvement... as usual

>

Nov 26, 1999, 3:00:00 AM11/26/99

to

In article <81kp5q$7id$1...@autumn.news.rcn.net>,

Pa...@ResistanceIsFutile.com wrote:

>

> you've probably already seen something like:

>

> int gcd(int a, int b)

> {

Pa...@ResistanceIsFutile.com wrote:

>

> you've probably already seen something like:

>

> int gcd(int a, int b)

> {

[snip]

>

> ; extern int gcdAsm(int a, int b)

> ;

> ; computes gcd(a,b) according to:

> ; 1. a even, b even: gcd(a,b) = 2 * gcd(a/2,b/2)

> ; 2. a even, b uneven: gcd(a,b) = gcd(a/2,b)

> ; 3. a uneven, b even: gcd(a,b) = gcd(a,b/2)

> ; 4. a uneven, b uneven: a>b ? a -= b : b -= a, i.e. gcd(a,b) =

gcd(min(a,b),max(a,b) - min(a,b))

> ; do 1., repeat 2. - 4. until a = 0 or b = 0

>

[snip]

Cor, are all those PUSH TEST SHL JMP SUB ADD POP faster than this ?

I know divide can be done by shift and subtract, but

it looks like a lot of overhead to reduce the number of subtracts

performed. Are the jumps so bad in the odd,odd case?

int gcdasm(int a, int b)

{

__asm {

mov eax,dword ptr [esp+4] ;//a cheaper than EBP relative

mov ecx,dword ptr [esp+8] ;//b

L0:

cmp eax,ecx

je L2

jg L1

xchg eax,ecx

L1:

sub eax,ecx

jmp L0

L2:

}

}

Can't you keep the shift count in CL (ECX) rather than EDI ?

kind regards,

David Earlam

"In the country of the blind the one eyed man is king" (Desiderius

Erasmus c. 1465-1536)

"Beauty problems are redefined, the doorbells do not ring" (John

Cooper-Clarke 1959-)

David....@arm.com (speaking for myself)

Sent via Deja.com http://www.deja.com/

Before you buy.

Nov 28, 1999, 3:00:00 AM11/28/99

to

some people have suggested (or asked why I didn't use)

subtraction-only methods...

why didn't I use it ?

easy, because it's bad !

a nice implementation of that idea can be seen at Paul Hsieh's website:

http://www.azillionmonkeys.com/qed/asmexample.html

it goes like this:

gcd: add edx,eax

jne L1

mov eax,1

jmp L2

L1: mov ebx,edx ; 1 dep: edx

add eax,eax ; 1

sub ebx,eax ; 2 dep: eax

shr eax,1 ; 2

mov ecx,ebx ; 3 dep: ebx

sar ebx,1fH ; 3

and ebx,ecx ; 4 dep: ecx

add eax,ebx ; 5 dep: ebx

sub edx,eax ; 6 dep: eax

test eax,eax ; 6

jne L1 ; 6

mov eax,edx

L2:

(see the page for details)

now, remember the "original" C version ?

int gcd(int a, int b)

{

int r;

if(a == 0)

return b;

while(b != 0)

{

r = a % b;

a = b;

b = r;

}

return a;

}

what's the advantage of that one over a subtraction-only method ?

there's basically only one: speed !

imagine gcd(1,1000000) ! how many iterations are needed for that one ?

that's the point

the C version may use slow operations, but it only takes a couple of runs

to come up with the result

btw, that algorithm is some 2000 years old ! and still perfectly valid :)

now, on to the real stuff:

while I was toying around with a branchless array-version, I finally

decided to print out all intermediate values

and, after some long look into that...

(boring I might add... pages of pages of numbers only :)

noticed that a couple of steps

can simply be ...stepped over !

first of all, "simplifying" by 2 should be done as much as possible

the smaller the numbers, the faster we get a result

second: there's no need to work with max(a,b) !

it's enough to use max(a,b) - min(a,b) only

even better, no storage space for temporary values is needed

one can simply calculate the next values immediately ! :)

in short, all we have to do is:

get min(a,b)

get max(a,b)

calculate max(a,b) - min(a,b)

simplify those 2 values by 2 as much as possible

loop until max(a,b) - min(a,b) = 0

that's it already

back to our gcd(1,1000000) example

how many iterations do we need ?

1 1 000 000

1 15 625

1 1 953

1 61

1 15

1 7

1 3

1 1

1 0

done ! amazing, isn't it ? :)

btw, worst case input for that alg. is actually 2 consecutive Fibonacci numbers

(Fn = Fn-1 + Fn-2, F0=1, F1=1)

e.g. 89 and 55

the original c version does:

(89,55) -> (55,34) -> (34,21) -> (21,13) -> (13,8) -> (8,5) -> (5,3) -> (3,2) -> (2,1) -> (1,0)

"my" version:

(89,55) -> (55,17) -> (17,19) -> (17,1) -> (1,1) -> (1,0)

:)

(subtraction-only is left as an exercice for the reader)

finally, combining all of the above, some (lots) of ideas from replies

and a bit of thinking of my own, lead to this (final ?) asm version,

(I know, you've all been waiting for this, but I felt like boring you with details first :)

; int gcdAsm(int a, int b)

;

; computes gcd(a,b) according to:

; 1. a even, b even: gcd(a,b) = 2 * gcd(a/2,b/2), and remember how often this happened

; 2. a even, b uneven: gcd(a,b) = gcd(a/2,b)

; 3. a uneven, b even: gcd(a,b) = gcd(a,b/2)

; 4. a uneven, b uneven: a>b ? a -= b : b -= a, i.e. gcd(a,b) = gcd(min(a,b),max(a,b) - min(a,b))

; do 1., repeat 2. - 4. until a = 0 or b = 0

; return (a + b) corrected by the remembered value from 1.

BITS 32

GLOBAL _gcdAsm

SECTION .text

_gcdAsm:

push ebp

mov ebp,esp

push ebx

push ecx

push edx

push edi

mov eax,[ebp + 8] ; eax = a (0 <= a <= 2^31 - 1 (= 2 147 483 647))

mov ebx,[ebp + 12] ; ebx = b (0 <= b <= 2^31 - 1)

; by definition: gcd(a,0) = a, gcd(0,b) = b, gcd(0,0) = 1 !

mov ecx,eax

or ecx,ebx

bsf ecx,ecx ; extract the greatest common power of 2 of a and b...

jnz notBoth0

mov eax,1 ; if a = 0 and b = 0, return 1

jmp done

notBoth0:

mov edi,ecx ; ...and remember it for later use

test eax,eax

jnz aNot0

mov eax,ebx ; if a = 0, return b

jmp done

aNot0:

test ebx,ebx

jz done ; if b = 0, return a

bsf ecx,eax ; "simplify" a as much as possible

shr eax,cl

bsf ecx,ebx ; "simplify" b as much as possible

shr ebx,cl

mainLoop:

mov ecx,ebx

sub ecx,eax ; b - a

sbb edx,edx ; edx = 0 if b >= a or -1 if a > b

and ecx,edx ; ecx = 0 if b >= a or b - a if a > b

add eax,ecx ; a-new = min(a,b)

sub ebx,ecx ; b-new = max(a,b)

sub ebx,eax ; we're now sure that that difference is >= 0

bsf ecx,eax ; "simplify" as much as possible by 2

shr eax,cl

bsf ecx,ebx ; "simplify" as much as possible by 2

shr ebx,cl

jnz mainLoop ; keep looping until ebx = 0

mov ecx,edi ; multiply back with the common power of 2

shl eax,cl

done:

pop edi

pop edx

pop ecx

pop ebx

mov esp,ebp

pop ebp

ret ; eax = gcd(a,b)

this should even satisfy those that asked for "cleaner" code with less exit-points :)

("bsf" is really only an option on ppro, p2 or better

but, since this is supposed to run on p2... :)

all comments, as usual, welcome

Yours

P

" A child of five could understand this, fetch me a child of five "

Groucho Marx

Nov 28, 1999, 3:00:00 AM11/28/99

to

addendum:

since eax actually only ever holds odd values anyway

the bsf/shr combo is pretty much pointless :)

the "inner" loop should look like this:

mainLoop:

mov ecx,ebx

sub ecx,eax ; b - a

sbb edx,edx ; edx = 0 if b >= a or -1 if a > b

and ecx,edx ; ecx = 0 if b >= a or b - a if a > b

add eax,ecx ; a-new = min(a,b)

sub ebx,ecx ; b-new = max(a,b)

sub ebx,eax ; we're now sure that that difference is >= 0

bsf ecx,ebx ; "simplify" as much as possible by 2

shr ebx,cl

jnz mainLoop ; keep looping until ebx = 0

all comments, as usual, welcome

Reply all

Reply to author

Forward

0 new messages

Search

Clear search

Close search

Google apps

Main menu