19 views

Skip to first unread message

Aug 25, 2022, 2:34:47 PM8/25/22

to

On Thu, 25 Aug 2022 10:17:26 -0700 (PDT)

Michał Kasprzak <siarc...@gmail.com> wrote:

> Look at the xor r10d, r10d instruction in the heapsort.asm file.

> This known technique of limiting code length by working on the lower 32 bits of 64-bit registers, which works great for rax-rdx registers, unfortunately does not work for extended registers r8-r15. Even the 32-bit xor r10d, r10d requires a prefix byte and the entire instruction is also 3 bytes long.

> It's a pity, but we can swap the roles of the r10 and rax registers in the sorting loop part of the code to be able to write xor eax, eax (which is 2 bytes long) instead of xor r10d, r10d and get 1 byte shorter code :)

> And that's exactly what I did in the patched version of heapsort.asm below.

> Now the revised assembled code is 181 bytes long. It is really short! When you honestly compare this length with the numbers you saw earlier in this thread remember that this code has a sift code inlined in two places for quicker operation.

>

> Who next of you will shorten this code more or find some improvement to this code? :)

>

You might try an asm group

I'll try a xpost to clax

> ---------- start of file: heapsort.asm, cut below this line ----------

> BITS 64

> ; rdi = array's address

> ; rsi = array's number of elements = n

> heapsort: mov rax, rsi

> shr rax, 1

> ; nothing to sort if there is 0 or 1 element in the array

> jz the_end

> ; now we are sure that always n>=2

> ; r10 = index of the last parent node

> ; r10 is also the counter of the heap building loop

> lea r10, [rax-1]

> ; rbx = index of the left child of the last parent node

> ; rbx is also index of the left child of r10 node in the loop

> lea rbx, [rax+rax-1]

> ; heap building loop

> ; sift the counter node's value stored in r9 to the proper position

> ; r9 = array[r10]

> build_loop: mov r9, qword [rdi+8*r10]

> ; r11 is index of target place for sifted values

> ; we start sifting from the counter node's index

> mov r11, r10

> ; rax is the index of a current child of r11 node

> ; we start r11 with the index of the left child

> mov rax, rbx

> jmp sift1_while

> ; we will find the child with bigger value and store its index in rax

> ; we will load this bigger value to rdx

> ; rcx holds the index of the right child

> sift1: lea rcx, [rax+1]

> mov rdx, qword [rdi+8*rax]

> ; note that the right child may not exist

> ; then we consider the left child to be bigger

> cmp rcx, rsi

> jnb sift1_check

> ; we load the right child node's value to r8

> mov r8, qword [rdi+8*rcx]

> ; if the right child node's value is bigger than the left child's

> cmp r8, rdx

> jle sift1_check

> ; then we will proceed the right child node

> mov rdx, r8

> mov rax, rcx

> ; compare the values of the current child node with the sifting node

> sift1_check: cmp r9, rdx

> ; if the sifting node's value is bigger then we end sifting

> jnl build_next

> ; we store the child's value into the parent node's place

> mov qword [rdi+8*r11], rdx

> ; we make the child's node the new parent's node

> mov r11, rax

> lea rax, [rax+rax+1]

> ; do sift if there is still at least one child

> sift1_while: cmp rax, rsi

> jb sift1

> ; we store sifted value into the target place and repeat the loop

> build_next: mov qword [rdi+8*r11], r9

> sub rbx, 2

> sub r10, 1

> jnc build_loop

> ; the sorting loop starts with n-1

> ; r10 will hold the counter of the sorting loop

> lea r10, [rsi-1]

> ; the sorting loop

> ; rdx = array[0]

> ; r9 = array[counter]

> sort_loop: mov rdx, qword [rdi]

> mov r9, qword [rdi+8*r10]

> ; we will sift the node rax=0 which value is in r9

> xor eax, eax

> ; we store array[counter]=array[0]

> mov qword [rdi+8*r10], rdx

> ; now we will store in rdx the left child node's index

> mov edx, 1

> jmp sift2_while

> ; sift2 works like sift1 with changed registers

> ; we will find the child with bigger value and store its index in rdx

> ; we will load this bigger value to rsi

> ; rcx holds the index of the right child

> sift2: lea rcx, [rdx+1]

> mov rsi, qword [rdi+8*rdx]

> ; note that the right child may not exist

> ; then we consider the left child to be bigger

> cmp rcx, r10

> jnb sift2_check

> ; we load the right child node's value to r8

> mov r8, qword [rdi+8*rcx]

> ; if the right child node's value is bigger than the left child's

> cmp r8, rsi

> jle sift2_check

> ; then we will proceed the right child node

> mov rsi, r8

> mov rdx, rcx

> ; compare the values of the current child node with the sifting node

> sift2_check: cmp r9, rsi

> ; if the sifting node's value is bigger then we end sifting

> jnl sort_next

> ; we store the child's value into the parent node's place

> mov qword [rdi+8*rax], rsi

> ; we make the child's node the new parent's node

> mov rax, rdx

> lea rdx, [rdx+rdx+1]

> ; do sift if there is still at least one child

> sift2_while: cmp rdx, r10

> jb sift2

> ; we store sifted value into the target place

> sort_next: mov qword [rdi+8*rax], r9

> ; next step of the sorting loop

> sub r10, 1

> ; the sorting loop ends with 1

> jnz sort_loop

> the_end:

> ---------- end of file: heapsort.asm, cut above this line ----------

--

Bah, and indeed Humbug.

Michał Kasprzak <siarc...@gmail.com> wrote:

> Look at the xor r10d, r10d instruction in the heapsort.asm file.

> This known technique of limiting code length by working on the lower 32 bits of 64-bit registers, which works great for rax-rdx registers, unfortunately does not work for extended registers r8-r15. Even the 32-bit xor r10d, r10d requires a prefix byte and the entire instruction is also 3 bytes long.

> It's a pity, but we can swap the roles of the r10 and rax registers in the sorting loop part of the code to be able to write xor eax, eax (which is 2 bytes long) instead of xor r10d, r10d and get 1 byte shorter code :)

> And that's exactly what I did in the patched version of heapsort.asm below.

> Now the revised assembled code is 181 bytes long. It is really short! When you honestly compare this length with the numbers you saw earlier in this thread remember that this code has a sift code inlined in two places for quicker operation.

>

> Who next of you will shorten this code more or find some improvement to this code? :)

>

You might try an asm group

I'll try a xpost to clax

> ---------- start of file: heapsort.asm, cut below this line ----------

> BITS 64

> ; rdi = array's address

> ; rsi = array's number of elements = n

> heapsort: mov rax, rsi

> shr rax, 1

> ; nothing to sort if there is 0 or 1 element in the array

> jz the_end

> ; now we are sure that always n>=2

> ; r10 = index of the last parent node

> ; r10 is also the counter of the heap building loop

> lea r10, [rax-1]

> ; rbx = index of the left child of the last parent node

> ; rbx is also index of the left child of r10 node in the loop

> lea rbx, [rax+rax-1]

> ; heap building loop

> ; sift the counter node's value stored in r9 to the proper position

> ; r9 = array[r10]

> build_loop: mov r9, qword [rdi+8*r10]

> ; r11 is index of target place for sifted values

> ; we start sifting from the counter node's index

> mov r11, r10

> ; rax is the index of a current child of r11 node

> ; we start r11 with the index of the left child

> mov rax, rbx

> jmp sift1_while

> ; we will find the child with bigger value and store its index in rax

> ; we will load this bigger value to rdx

> ; rcx holds the index of the right child

> sift1: lea rcx, [rax+1]

> mov rdx, qword [rdi+8*rax]

> ; note that the right child may not exist

> ; then we consider the left child to be bigger

> cmp rcx, rsi

> jnb sift1_check

> ; we load the right child node's value to r8

> mov r8, qword [rdi+8*rcx]

> ; if the right child node's value is bigger than the left child's

> cmp r8, rdx

> jle sift1_check

> ; then we will proceed the right child node

> mov rdx, r8

> mov rax, rcx

> ; compare the values of the current child node with the sifting node

> sift1_check: cmp r9, rdx

> ; if the sifting node's value is bigger then we end sifting

> jnl build_next

> ; we store the child's value into the parent node's place

> mov qword [rdi+8*r11], rdx

> ; we make the child's node the new parent's node

> mov r11, rax

> lea rax, [rax+rax+1]

> ; do sift if there is still at least one child

> sift1_while: cmp rax, rsi

> jb sift1

> ; we store sifted value into the target place and repeat the loop

> build_next: mov qword [rdi+8*r11], r9

> sub rbx, 2

> sub r10, 1

> jnc build_loop

> ; the sorting loop starts with n-1

> ; r10 will hold the counter of the sorting loop

> lea r10, [rsi-1]

> ; the sorting loop

> ; rdx = array[0]

> ; r9 = array[counter]

> sort_loop: mov rdx, qword [rdi]

> mov r9, qword [rdi+8*r10]

> ; we will sift the node rax=0 which value is in r9

> xor eax, eax

> ; we store array[counter]=array[0]

> mov qword [rdi+8*r10], rdx

> ; now we will store in rdx the left child node's index

> mov edx, 1

> jmp sift2_while

> ; sift2 works like sift1 with changed registers

> ; we will find the child with bigger value and store its index in rdx

> ; we will load this bigger value to rsi

> ; rcx holds the index of the right child

> sift2: lea rcx, [rdx+1]

> mov rsi, qword [rdi+8*rdx]

> ; note that the right child may not exist

> ; then we consider the left child to be bigger

> cmp rcx, r10

> jnb sift2_check

> ; we load the right child node's value to r8

> mov r8, qword [rdi+8*rcx]

> ; if the right child node's value is bigger than the left child's

> cmp r8, rsi

> jle sift2_check

> ; then we will proceed the right child node

> mov rsi, r8

> mov rdx, rcx

> ; compare the values of the current child node with the sifting node

> sift2_check: cmp r9, rsi

> ; if the sifting node's value is bigger then we end sifting

> jnl sort_next

> ; we store the child's value into the parent node's place

> mov qword [rdi+8*rax], rsi

> ; we make the child's node the new parent's node

> mov rax, rdx

> lea rdx, [rdx+rdx+1]

> ; do sift if there is still at least one child

> sift2_while: cmp rdx, r10

> jb sift2

> ; we store sifted value into the target place

> sort_next: mov qword [rdi+8*rax], r9

> ; next step of the sorting loop

> sub r10, 1

> ; the sorting loop ends with 1

> jnz sort_loop

> the_end:

> ---------- end of file: heapsort.asm, cut above this line ----------

--

Bah, and indeed Humbug.

Reply all

Reply to author

Forward

0 new messages

Search

Clear search

Close search

Google apps

Main menu