
Does this strstr code seem okay?


DSF

Jul 30, 2014, 3:05:05 AM
Hello, group!

I've been having some short-term memory problems lately and I would
appreciate it if you could look over this code to replace strstr and
see if I have "all my ducks in a row."

_TEXT segment public use32 'CODE'

CPROC dstrstrA ;, string1:DWORD, string2:DWORD
;
; char *dstrstrA(const char *str1, const char *str2)
;
push esi
push edi
push ebp
push ebx

mov edi, [esp+24]
mov esi, [esp+20]

; dstrlenda_asm takes edi as a pointer
; to the starting address of string
; and returns a pointer to the ending
; zero in edi and the length of the
; string (in characters) in ecx. The Z
; flag is set if the string is zero
; bytes long.
; Only eax, ecx, edi, and flags are
; affected.

; Get lengths for str2 in edx and
; str1 in ebx.
; If str2 is zero length, jmp to
; test str1 for zero length to see
; if they match.
mov ebp, edi
call dstrlenda_asm
jz isstr1zero
mov edx, ecx

; If str2 is zero length, this point
; would never be reached, so if str1
; is zero length, not found.
mov edi, esi
call dstrlenda_asm
jz notfound
mov ebx, ecx

; eax is used as an incremental counter
; for str1 for the comparison code
mov eax, esi

; If the length of str2 > the length of str1,
; str2 cannot possibly be found.
; Note that this also prevents cmpsb from
; overshooting the bounds of str1.
nextstr1:
cmp edx, ebx
ja notfound

; Test for str2 in str1 at current position
; eax. Exit if found. eax will contain
; starting position of str2 in str1
mov ecx, edx
mov esi, eax
mov edi, ebp
cld
repe cmpsb
je exit

; Increment search position in str1 and
; decrement number of characters left
; in str1 to search.
inc eax
dec ebx
jmp nextstr1

; Return NULL if not found
notfound:
xor eax, eax

exit:
pop ebx
pop ebp
pop edi
pop esi
ret

; If length of str2 is zero, see if 1st
; character of str1 is zero. If so, return
; start of str1, else return NULL.
isstr1zero:
xor eax, eax
cmp byte ptr[esi], 0
cmove eax, esi
jmp exit
dstrstrA endp
end

Thank you,
DSF
"'Later' is the beginning of what's not to be."
D.S. Fiscus

Melzzzzz

Jul 30, 2014, 1:52:29 PM
On Wed, 30 Jul 2014 03:05:05 -0400
DSF <nota...@address.here> wrote:

> Hello, group!
>
> I've been having some short-term memory problems lately and I would
> appreciate it if you could look over this code to replace strstr and
> see if I have "all my ducks in a row."

C strstr does not use an O(n^2) algorithm.

Here is the Boyer-Moore-Horspool algorithm:

; rdi haystack,rsi size, rdx needle, rcx size
bmhorspoolasm:
xchg rsi,rdx
mov r11,rcx
mov al,[rsi+rcx-1]
mov ah,[rsi]
push rbx
xor rbx,rbx
.L0:
cmp rdx,r11
jl .fail
mov bl,[rdi+rcx-1]
cmp ah,[rdi]
jne .cont
cmp al,bl
jne .cont
xor r9d,r9d
.L1:
lddqu xmm0,[rdi+r9]
lddqu xmm1,[rsi+r9]
pcmpeqb xmm0,xmm1
pmovmskb r8d,xmm0
sub rcx,16
jle .shift
xor r8w,0xffff
jnz .cont
add r9,16
jmp .L1
.shift:
neg rcx
mov r10w,0xffff
shl r10w,cl
shl r8w,cl
xor r8w,r10w
jz .success
.cont:
mov rcx,r11
lea r9,[occ_table]
mov r9,[r9+rbx*8]
cmp r9,1
jle .inc
add rdi,r9
sub rdx,r9
jmp .L0
.inc:
inc rdi
dec rdx
jmp .L0
.fail:
pop rbx
xor eax,eax
ret
.success:
pop rbx
mov rax,rdi
ret

; rdi needle-str, rsi size , call this before search
; skip table
create_occ_table:
mov r8,rdi
mov rcx,256
mov rdi,occ_table
mov rax,rsi
rep stosq
mov rdi,r8
cmp rsi,1
jle .exit
dec rsi
xor rcx,rcx
.L0:
mov r8,rsi
sub r8,rcx
xor r9d,r9d
mov r9b,[rdi+rcx]
lea r10,[occ_table]
mov [r9*8 + r10],r8
inc rcx
cmp rcx,rsi
jl .L0
.exit:
ret

; place this in data segment

occ_table times 256 dq 0
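
For cross-checking, the same skip-table (Horspool) idea can be written in a
few lines of C. This is a hedged sketch, not a transcription of the asm
above; the function name and the memcmp-based inner loop are mine:

```c
#include <stddef.h>
#include <string.h>

/* Horspool search: shift by how far the byte aligned with the pattern's
   last position is from the end of the pattern (the pattern's own last
   byte is excluded when building the table, as in create_occ_table). */
const char *bmh_search(const char *hay, size_t hlen,
                       const char *needle, size_t nlen)
{
    size_t skip[256];
    size_t i;

    if (nlen == 0)
        return hay;
    if (nlen > hlen)
        return NULL;

    for (i = 0; i < 256; i++)          /* default shift: whole pattern */
        skip[i] = nlen;
    for (i = 0; i + 1 < nlen; i++)     /* distance from each byte to the end */
        skip[(unsigned char)needle[i]] = nlen - 1 - i;

    for (i = 0; i + nlen <= hlen; /* shift below */) {
        if (memcmp(hay + i, needle, nlen) == 0)
            return hay + i;
        i += skip[(unsigned char)hay[i + nlen - 1]];
    }
    return NULL;
}
```

The 256-entry table is the whole setup cost; for a one-shot search with a
short pattern it can easily dominate the search itself.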

--
Click OK to continue...

Terje Mathisen

Aug 2, 2014, 2:08:58 PM
Melzzzzz wrote:
> On Wed, 30 Jul 2014 03:05:05 -0400
> DSF <nota...@address.here> wrote:
>
>> Hello, group!
>>
>> I've been having some short-term memory problems lately and I would
>> appreciate it if you could look over this code to replace strstr and
>> see if I have "all my ducks in a row."
>
> C strstr does not use n^2 algorithm.

Most uses of strstr() will be of a fairly short pattern string into
another not too long target, i.e. the Boyer-Moore setup overhead will
kill the performance.

I'd rather write a version with separate code for fast special cases,
i.e. detect a single-byte pattern as well as 2-4 byte patterns.

If the pattern is longer than 4 bytes I'd probably start by looking for
a match of the first four bytes, then go on to scan the remainder of the
target.

It is only when the pattern is significantly longer than 4 or 8 that it
really pays to go through the BM setup.
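
A hedged C sketch of that strategy (the function name and layout are mine;
a tuned version would use 32-bit loads instead of memcmp for the 4-byte
prefix test):

```c
#include <stddef.h>
#include <string.h>

const char *strstr_prefix4(const char *hay, const char *pat)
{
    size_t plen = strlen(pat);
    size_t hlen = strlen(hay);
    size_t i;

    if (plen == 0)
        return hay;              /* empty pattern matches at the start */

    if (plen <= 4) {
        /* Short-pattern special case: first-byte filter, then compare. */
        for (i = 0; i + plen <= hlen; i++)
            if (hay[i] == pat[0] && memcmp(hay + i, pat, plen) == 0)
                return hay + i;
        return NULL;
    }

    for (i = 0; i + plen <= hlen; i++) {
        /* Cheap 4-byte prefix check rejects most positions before the
           full-length compare of the remainder. */
        if (memcmp(hay + i, pat, 4) == 0 &&
            memcmp(hay + i + 4, pat + 4, plen - 4) == 0)
            return hay + i;
    }
    return NULL;
}
```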

Terje
- <Terje.Mathisen at tmsw.no>
"almost all programming can be viewed as an exercise in caching"

Rod Pemberton

Aug 2, 2014, 4:46:31 PM
On Sat, 02 Aug 2014 14:08:58 -0400, Terje Mathisen
<terje.m...@nospicedham.tmsw.no> wrote:
> Melzzzzz wrote:
>> On Wed, 30 Jul 2014 03:05:05 -0400
>> DSF <nota...@address.here> wrote:

>>> I've been having some short-term memory problems lately and I would
>>> appreciate it if you could look over this code to replace strstr and
>>> see if I have "all my ducks in a row."
>>
>> C strstr does not use n^2 algorithm.
>
> Most uses of strstr() will be of a fairly short pattern string into
> another not too long target, i.e. the Boyer-Moore setup overhead will
> kill the performance.
>
> I'd rather write a version with separate code for fast special
> cases, i.e. detect a single-byte pattern as well as 2-4 byte patterns.
>
> If the pattern is longer than 4 bytes I'd probably start by looking
> for a match of the first four bytes, then go on to scan the remainder
> of the target.
>
> It is only when the pattern is significantly longer than 4 or 8
> that it really pays to go through the BM setup.
>

Average word length in English is about 7 characters,
with English word lengths showing a large decline in
quantity at about 17 characters.

So, if it's for English words, he's probably better
off with a search for the first byte, then check English
word for a match. I.e., with such short lengths of
text, there is no need for 2 or 4 byte searches. Three
decades ago, this was quite fast even in interpreted
BASIC on a 1 Mhz 6502 processor.


Rod Pemberton

Terje Mathisen

Aug 2, 2014, 5:04:29 PM
If English words are taken as the target, then you should at least scan
for the least common letter anywhere in the pattern, then use that as
the starting point of the search, since that will give (sometimes far)
less false positives...


I.e. (after the initial setup):

again:
jecxz not_found

repne scasb
jne not_found

push esi
push ecx
sub esi,[starting_offset]
mov ecx,[pattern_len]
mov edi,[pattern_start]
repe cmpsb
pop ecx
pop esi
jne again

found: ;; We have a match!
sub esi,[pattern_len]
mov eax,esi
ret

not_found:
xor eax,eax ;; Return NULL for not found
ret


Terje

Rod Pemberton

Aug 2, 2014, 6:41:16 PM
On Sat, 02 Aug 2014 17:04:29 -0400, Terje Mathisen
While that would reduce false positives, isn't the time overhead of
identifying the least common character of short words sufficiently
large to justify *not* doing so for most cases? You have to scan
the string character by character likely using a 256 byte lookup
table, i.e., 7 lookups based on what I said about average length.
Whereas for a simple search, the check for the next character is
likely to reject. So, you'd have to have more than 7 false matches
per beginning word letter in the given input stream before that
pays off the overhead cost. This is similar to the extra overhead
for computing a hash or another string comparison.

What is a large input to strstr()? You said:

TM> "fairly short pattern string into another not too long target".

I'll take that to mean my paragraph above is a "long target."
Let's see how many times the first letter of a word is repeated
7 times or more. When taking the case of letters into account,
only words beginning with 3 letters justified the extra overhead,
letters 'c' with 9, 's' with 10, and 't' with 16. That means
searches are slowed down because of the overhead for all the other
beginning letters searched for. Letters 'i' with 7, 'l' with 7,
'o' with 7 almost paid off, but didn't. There are 25 beginning
letters in the paragraph, but the user can search for 52. So,
(3/52)*100 = 5.8% of the time the overhead improves things, and
makes things worse ((52-3)/52)*100 = 94% of the time.

If you want to add extra overhead to strstr() that's not likely
to improve search times in general, there are lots of ways
to do that. If he was constructing a search routine for a
word processor where the input stream is much larger, things
would be different.


Rod Pemberton

Terje Mathisen

Aug 3, 2014, 11:32:57 AM
Probably! :-)

> the string character by character likely using a 256 byte lookup
> table, i.e., 7 lookups based on what I said about average length.
> Whereas for a simple search, the check for the next character is
> likely to reject. So, you'd have to have more than 7 false matches
> per beginning word letter in the given input stream before that
> pays off the overhead cost. This is similar to the extra overhead
> for computing a hash or another string comparison.

Right.
>
> What is a large input to strstr()? You said:
>
> TM> "fairly short pattern string into another not too long target".
>
> I'll take that to mean my paragraph above is a "long target."
> Let's see how many times the first letter of a word is repeated
> 7 times or more. When taking the case of letters into account,
> only words beginning with 3 letters justified the extra overhead,
> letters 'c' with 9, 's' with 10, and 't' with 16. That means
> searches are slowed down because of the overhead for all the other
> beginning letters searched for. Letters 'i' with 7, 'l' with 7,

So what about a limited scan:

We have to check the length of the pattern to search for, so with
C-style strings we have to load all the bytes anyway.

We can lookup the first 2-4 bytes in a table, in order to check if this
is a "too-common" starting letter, stopping as soon as we find an
"uncommon" one or picking the best of those 2-4 first.

> 'o' with 7 almost paid off, but didn't. There are 25 beginning
> letters in the paragraph, but the user can search for 52. So,
> (3/52)*100 = 5.8% of the time the overhead improves things, and
> makes things worse ((52-3)/52)*100 = 94% of the time.
>
> If you want to add extra overhead to strstr() that's not likely
> to improve search times in general, there are lots of ways

Nope! :-)

> to do that. If he was constructing a search routine for a
> word processor where the input stream is much larger, things
> would be different.

Absolutely, for locating all instances of "xyz" in a huge document, BM
is obviously well worth it!

DSF

Aug 17, 2014, 3:07:06 PM
On Wed, 30 Jul 2014 03:05:05 -0400, DSF <nota...@address.here>
wrote:

>Hello, group!
>

My computer's been down for a while. I've read all of your comments
and while I haven't changed the basic algorithm, I did try to reduce
the calls to rep cmpsb because its initial setup is costly. It
seemed wasteful to call rep cmpsb for every character of the search
string, so now it's only called on a character match.
; eax is used as an incremental index
; into str1 for the comparison code and
; ebx is the length (in characters) from
; the current position in str1 to the end

; Compensates for pre add and subtract
lea eax, [esi-1]
lea ebx, [ecx+1]

; Increment search position in str1 and
; decrement number of characters left
; in str1 to search.
nextstr1:
inc eax
dec ebx

; If the length of str2 > the current length of str1,
; str2 cannot possibly be found.
; Note that this also prevents cmpsb from
; overshooting the bounds of str1.
cmp edx, ebx
ja notfound

; Compare characters until match or end
; This favors the risk of a bad branch prediction against
; the setup time for rep cmpsb. Since typical branch
; prediction assumes backward branches are taken,
; I think it's a good move. rep cmpsb will only be
; called if a character match is found
mov, cl byte ptr[eax]
cmp, cl byte ptr[epb]
jne nextstr1

; Test for str2 in str1 at current position
; eax. Exit if found. eax will contain
; starting position of str2 in str1
mov ecx, edx
mov esi, eax
mov edi, ebp
cld
repe cmpsb
jne nextstr1

exit:
pop ebx
pop ebp
pop edi
pop esi
ret

; Return NULL if not found
notfound:
xor eax, eax
jmp exit

; If length of str2 is zero, see if 1st
; character of str1 is zero. If so, return
; start of str1, else return NULL.
isstr1zero:
xor eax, eax
cmp byte ptr[esi], 0
cmove eax, esi
jmp exit
dstrstrA endp
end


The code is a replacement for the generic strstr and therefore
should not be optimized for any particular situation. Source and
search strings need not contain words, English or otherwise. A better
way to look at it is that it searches "source" (a zero-terminated
character array) for "search" (another zero-terminated character
array).

In my current instance, at this time, I am parsing lines of HTML
code for particular phrases. Source can have a length of very few
characters to thousands and contain many "non-words." (This is why I
made the above changes. To reduce the possible thousands of calls to
rep cmpsb.)

I had thought of checking four bytes at a time, but ruled it out
because of alignment issues. You can't read four bytes at a time
unaligned because you might read outside your memory's boundary. (An
important piece of information I'd forgotten about during the trying
times I've been dealing with over the past few years. Fortunately, it
created a bug that came back to bite me in the ass very quickly. I
was *very* lucky that the slight overread actually read outside of the
compiler's memory allocation and caused an app fault. It could have
caused silent, difficult to trace, problems for a long time.)

Appreciate the code, Melzzzzz, but it's going to be quite a while
(if ever) before I try 64-bit programming.

I really was looking for input as to any glaringly obvious bugs.
Faster is better, but I wrote the code so I could also facilitate a
wide version. Besides, my ancient Borland compiler's strstr was
written in C which is quite literal in its compilation. (Check
comp.lang.c for "C can be very literal.")

For comparison, here is the C and assembly produced by the Borland
compiler for their C source for strstr. ...and time for a history
question: I notice the compiler always adds a negative number to esp.
Was there a time when add was faster than sub?

/*
* C/C++ Run Time Library - Version 2.0
* Copyright (c) 1987, 1996 by Borland International
* All Rights Reserved.
*/
char * _RTLENTRY _EXPFUNC strstr(const char *str1, const char *str2)
{
int len1 = strlen(str1);
int len2 = strlen(str2);
int i,j,k;

if (!len2)
return (char *)str1; /* return str1 if str2 empty */
if (!len1)
return 0; /* return NULL if str1 empty */
i = 0;
for(;;)
{
while(i < len1 && str1[i] != str2[0])
++i;
if (i == len1)
return 0;
j = 0;
k = i;
while (i < len1 && j < len2 && str1[i] == str2[j])
{
++i;
++j;
}
if (j == len2)
return (char *)str1+k;
if (i == len1)
return 0;
i = k+1;
}
}

push ebp
mov ebp,esp
add esp,-8 <-????
push ebx
push esi
push edi
mov esi,dword ptr [ebp+8]
@1:
push esi
call _strlen
pop ecx
mov ebx,eax

push dword ptr [ebp+12]
call _strlen
pop ecx
mov dword ptr [ebp-4],eax

mov eax,dword ptr [ebp-4]
test eax,eax
jne short @2
mov eax,esi
jmp short @3
@2:
test ebx,ebx
jne short @4
xor eax,eax
jmp short @3
@4:
xor eax,eax
jmp short @8
@7:
inc eax
@8:
cmp ebx,eax
jle short @9
mov dl,byte ptr [esi+eax]
mov ecx,dword ptr [ebp+12]
mov cl,byte ptr [ecx]
cmp dl,cl
jne short @7
@9:
cmp ebx,eax
jne short @10
xor eax,eax
jmp short @3
@10:
xor edx,edx
mov dword ptr [ebp-8],eax
jmp short @12
@11:
inc eax
inc edx
@12:
cmp ebx,eax
jle short @13
mov ecx,dword ptr [ebp-4]
cmp edx,ecx
jge short @13
mov cl,byte ptr [esi+eax]
mov edi,dword ptr [ebp+12]
cmp cl,byte ptr [edi+edx]
je short @11
@13:
mov ecx,dword ptr [ebp-4]
cmp edx,ecx
jne short @14
mov eax,dword ptr [ebp-8]
add eax,esi
jmp short @3
@14:
cmp ebx,eax
jne short @15
xor eax,eax
jmp short @3
@15:
mov eax,dword ptr [ebp-8]
inc eax
jmp short @8
@17:
@3:
pop edi
pop esi
pop ebx
pop ecx
pop ecx
pop ebp
ret
Any comments on the Borland code?

Frank Kotler

Aug 17, 2014, 7:13:13 PM
DSF wrote:

...
> cmp, cl byte ptr[epb]

Typo alert!

Best,
Frank

Terje Mathisen

Aug 18, 2014, 3:27:02 AM
DSF wrote:
> The code is a replacement for the generic strstr and therefore
> should not be optimized for any particular situation. Source and
> search strings need not contain words, English or otherwise. A better
> way to look at it is that it searches "source" (a zero-terminated
> character array) for "search" (another zero-terminated character
> array).

You can avoid the explicit strlen calls if you combine the search for
the initial char with the scan for a terminating zero, but that would
mean that you could not use REPNE SCASB to locate the first matching
char. I.e. you need to know the max length.

Given that you start with a REPNE SCASB to get the length of the buffer
to be searched, I would simply stick with the naive code:

REPNE SCASB to locate possible matches, followed by REPE CMPSB to check
for a full match.

If you want to be more fancy, then you should look hard at STRCMPI which
is more or less optimized to do this type of work:

Look at 16 byte (or 8 short) values at the same time, for up to 16/8
different target chars.
>
> In my current instance, at this time, I am parsing lines of HTML
> code for particular phrases. Source can have a length of very few
> characters to thousands and contain many "non-words." (This is why I
> made the above changes. To reduce the possible thousands of calls to
> rep cmpsb.)

For your current code I would do things quite differently:

With a big target buffer (html file) and a limited number of key words
to look for, again and again, it makes a _lot_ of sense to switch to
Boyer-Moore instead!

Take the fixed hit of setting up a 256-byte skip table for each pattern
to search for, then blow through the actual searches.

The main headache with HTML parsing isn't finding the key words, but to
locate the word delimiters: Any kind of whitespace, plus several
context-sensitive characters can delimit the current token.

Rod Pemberton

Aug 18, 2014, 5:13:38 AM
On Sun, 17 Aug 2014 15:07:06 -0400, DSF <nota...@address.here> wrote:

> ; Get lengths for str2 in edx and
> ; str1 in ebx.

So, you're getting lengths for both strings first? ...

That means you're scanning each string at least twice: first to get the
length of each, then to do the search and/or comparison of the two. It
might be possible to search and compare and just stop if end-of-string
is reached. That would eliminate the extra scans for the string lengths.
Many C string routines do this. Although, from the code below, it seems
this one doesn't ...
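
A minimal C sketch of such a combined scan (illustrative only; library
versions are far more elaborate). There is no strlen pass: both loops stop
at whichever terminating zero they hit first:

```c
#include <stddef.h>

const char *strstr_onepass(const char *s, const char *pat)
{
    if (*pat == '\0')
        return s;                      /* empty pattern: match at start */

    for (; *s != '\0'; s++) {
        const char *a = s, *b = pat;
        /* Walk both strings in step; stop on mismatch or either NUL. */
        while (*a != '\0' && *b != '\0' && *a == *b) {
            a++;
            b++;
        }
        if (*b == '\0')                /* pattern exhausted: full match */
            return s;
    }
    return NULL;                       /* haystack exhausted: not found */
}
```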

> For comparison, here is the C and assembly produced by the Borland
> compiler for their C source for strstr: [...]
>

I'd suspect that there are only two ways for the assembly routine
to be faster than an efficient and optimized C routine:
1) if it uses x86 string instructions with REP prefix which many
C compilers won't emit
2) if it uses advanced, faster x86 instructions that C can't or
doesn't emit for string code in C, such as x87, MMX, SSEx etc

> /*
> * C/C++ Run Time Library - Version 2.0
> * Copyright (c) 1987, 1996 by Borland International
> * All Rights Reserved.
> */
> char * _RTLENTRY _EXPFUNC strstr(const char *str1, const char *str2)
> {
> int len1 = strlen(str1);
> int len2 = strlen(str2);
> int i,j,k;

Wow. The coder used I, J, and K!

Well, it seems they had a BASIC programmer code this C routine ...

> i = 0;
> for(;;)
> {
> while(i < len1 && str1[i] != str2[0])
> ++i;

... who hadn't learned how to use character pointers in C, yet.

The code would be more compact and faster if the programmer had used
character pointers, incrementing them to move to the next character,
and dereferencing it to retrieve characters for comparison. The code
would be more compact because he'd eliminate all or nearly all locals,
and much of the checks and looping. The subscript operator [] adds an
extra addition and multiplication per usage to the code as well.
I don't use strstr() and don't have a working version, but I'm not a
fan of the fact that pointers weren't used. It might've been easier
for the coder to halt/return/exit when a nul '\0' char was found too,
if he had used pointers.

So, this C code is probably a really bad example of how to implement
strstr(). I haven't looked at how others versions are implemented
since I've never, ever, needed or used it, ever.

You could look at GNU's GLIBC, or Redhat's newlib, or the C library
with a compiler such as OpenWatcom. If needed, I have P.J. Plauger's
"The Standard C Library" book, which implements an entire, compliant,
C library using standard C.

> ...and time for a history
> question. I notice the compiler always adds
> a negative number to esp.
> Was there a time when add was faster than sub?
>
> push ebp
> mov ebp,esp
> add esp,-8 <-????

"The ADD instruction does not distinguish between signed or unsigned
operands." -Intel IA-32 manual

I.e., the use of negative values was just a choice made by the assembler's
authors. E.g., they might've chosen a range of -128 to 127 for imm8 instead
of a range of 0 to 255. Other assemblers, like NASM, do this. Or, perhaps,
they allowed negative values to allow automatic sizing, e.g., -1 can be 0xFF
or 0xFFFFFFFF depending on the register size of the instruction.

For x86, "add esp,-8" can be encoded in different ways:

81 /0 id ADD r/m32,imm32
83 /0 ib ADD r/m32,imm8

For 83, -8 would be encoded as 248 (0xF8 or 0F8h). For 81, -8 would
be encoded as 4294967288 (0xFFFFFFF8). I.e., if you do a hex dump of
the assembled code you should see one of these:

81 C4 F8 FF FF FF
83 C4 F8

Since hexadecimal directly corresponds to bits in bytes and is unsigned,
it's usually better to think of all values in assembly as unsigned, IMO.
I.e., 0 to 255 is 00h to FFh, etc. You're going to be doing hex dumps.
IMO, it's a mistake to use negative values in assembly because of this,
unless the assembler supports an overlapping range of signed and unsigned,
e.g., -128 to 255 for -128 to 127 or 0 to 255. The only place negative
values are convenient in assembly IMO is -1 for all bits set, which is
integer or register size independent, i.e., can be 0xFF or 0xFFFFFFFF
depending on the register size.
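
The sign extension behind those two encodings can be checked in a couple of
lines of C (the variable names are illustrative, not from the post):

```c
#include <stdint.h>

/* The 83 /0 form stores -8 as the single byte 0xF8; the CPU sign-extends
   that imm8 to the full register width, giving the same 32-bit value the
   81 /0 form spells out in full. */
uint8_t  imm8_byte   = (uint8_t)-8;          /* 0xF8, the byte in 83 C4 F8 */
uint32_t sign_extend = (uint32_t)(int8_t)-8; /* 0xFFFFFFF8, as in the 81 form */
```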

> Was there a time when add was faster than sub?

For the 386, 486, and Pentium, that would be: "No." "add" and "sub"
have the same official timings on each respective processor. I don't
have official 286 or 86 (or 186) timings in my notes or more recent
processors. I'd suspect they're the same on each of the earlier
processors too. If they're different, I'd be curious to know why.


Rod Pemberton

Terje Mathisen

Aug 18, 2014, 6:37:00 AM
Sorry, Rod, but this is totally wrong!

Modern C compilers will often generate better code for the array indexed
version than for the classic 'c = *buf++;' idioms.

In particular you can compile both types of code into indexed access,
with different base pointers:

; ESI points to the target string, EDI to the buffer; ECX indexes both
next:
mov al,[esi+ecx] ; Get next character from the target
mov bl,[edi+ecx] ; Compare with buffer character
inc ecx

test al,al ; \0 string terminator?
je match_found

cmp al,bl ; Do the characters match?
je next

no_match: ; If we fall down here then there is no match
...
match_found:

> I don't use strstr() and don't have a working version, but I'm not a
> fan of the fact that pointers weren't used. It might've been easier
> for the coder to halt/return/exit when a nul '\0' char was found too,
> if he had used pointers.
>
> So, this C code is probably a really bad example of how to implement
> strstr(). I haven't looked at how others versions are implemented
> since I've never, ever, needed or used it, ever.

:-)

For an html parser there is another important consideration: All tokens
can have arbitrary case! This means that Boyer-Moore which can handle
case wrapping transparently will be even faster.

Terje

Steve

Aug 18, 2014, 8:11:07 AM
"Rod Pemberton" <dont_us...@nospicedham.xnothavet.cqm> writes:
>> Was there a time when add was faster than sub?
>
>For the 386, 486, and Pentium, that would be: "No." "add" and "sub"
>have the same official timings on each respective processor. I don't
>have official 286 or 86 (or 186) timings in my notes or more recent
>processors. I'd suspect they're the same on each of the earlier
>processors too.

Hi,

All my references (including dead tree versions from Intel and AMD)
have ADD and SUB with the same timings for the 8086/88, 80186/188,
and 80286 processors.

Regards,

Steve N.

Terje Mathisen

Aug 18, 2014, 2:32:22 PM
When you look at the actual HW, ADD and SUB are pretty much always the
same operation, just a bit toggle on input:

a - b

is the same as

a + (-b)

where (-b) is (b ^ -1) + 1 (i.e. invert all the bits, then add 1)

so we get

a + (b ^ -1) + 1

I.e. an adder can subtract by inverting all the bits in the second
operand and set the incoming carry flag for the low bit.

This means that you'll never see a cpu where ADD & SUB have different
timing, or at least not one with two's complement binary numbers.
:-)
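
The identity is easy to check in C; the sketch below mimics the hardware
trick (invert the second operand, feed a 1 in through the carry), using
unsigned arithmetic so the wraparound is well defined:

```c
#include <stdint.h>

/* a - b computed the way the ALU does it: a + (~b) + carry-in of 1. */
uint32_t sub_via_add(uint32_t a, uint32_t b)
{
    return a + (b ^ UINT32_MAX) + 1u;  /* b ^ -1 inverts all bits of b */
}
```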

Terje

Bernhard Schornak

Aug 18, 2014, 4:08:57 PM
I do not remember the timing for very old processors, but, beginning
with the first Athlon (2002 or so), add reg,reg / add reg, immediate
/ sub reg,reg / sub reg,immediate were performed in one clock cycle.
If the processor had to perform a not reg followed by an inc reg, it
should be a two clock operation (except the scheduler issues the not
and inc before the register is sent to the execution pipe). Both OPs
in one clock is (IMHO...) impossible.


Greetings from Augsburg

Bernhard Schornak

Andrew Cooper

Aug 18, 2014, 7:34:32 PM
Performing an explicit not operation on a reg/imm (and writing the
result back into the register file) is different to having circuitry in
place in the ALU to choose between an input, and the not of the same input.

One will take the same length of time as another instruction, while the
latter will take a fraction of extra propagation delay through silicon.

As it turns out, it is more likely that addition is implemented in terms
of subtraction, as subtraction is more useful than addition for
computation decisions. As a perfect example, the only arithmetic
operation available in the Manchester Baby computer was subtract. It
also only had a "load and negate" operation for retrieving a value from
RAM (well, Williams tubes, which are what passed for RAM in those days).

~Andrew

Terje Mathisen

Aug 19, 2014, 3:58:34 AM
You are misunderstanding me!

It is the hw which is doing an inline bitwise NOT, as part of the
routing of parameter B into the full adder: This takes _far_ less than a
full clock cycle, it is just a single XOR gate on each bit line, with
the second input being either 0 (i.e. do an ADD, no bit flip) or 1 for
the SUB case.

The same bit is also passed to the adder as the incoming carry to the
least significant bit.

A full 32 or 64 bit adder is a _big_ beast, a single XOR input gate is a
tiny addition.

In fact, if you can place the XOR gate somewhere on the path between the
register/bypass array and the ALU, it will take zero extra time!

This is because on modern cpus the wires have become a big problem: They
are actually slower than gates! This means that a medium-length wire
might well be replaced with an even number of NOT gates.

Bernhard Schornak

Aug 19, 2014, 12:02:02 PM
No. ;)


> It is the hw which is doing an inline bitwise NOT, as part of the routing of parameter B into the
> full adder: This takes _far_ less than a full clock cycle, it is just a single XOR gate on each bit
> line, with the second input being either 0 (i.e. do an ADD, no bit flip) or 1 for the SUB case.


Okay. Nevertheless, this is a NOT, not a NEG. For the latter one,
we have to add one, as well (which might turn the sign or wrap to
zero if the previous inverting operation set all bits to one).

Nevertheless, the idea of XOR gates with a bit mask is appealing!


> The same bit is also passed to the adder as the incoming carry to the least significant bit.
>
> A full 32 or 64 bit adder is a _big_ beast, a single XOR input gate is a tiny addition.
>
> In fact, if you can place the XOR gate somewhere on the path between the register/bypass array and
> the ALU, it will take zero extra time!
>
> This is because on modern cpus the wires have become a big problem: They are actually slower than
> gates! This means that a medium-length wire might well be replaced with an even number of NOT gates.


Wavelength of my FX-8350 (4 GHz) is 7.5 cm, its µPGA case is 4.00
cm. Going down to structure level, a 1 nm distance will be passed
in 0.133 ns ~ 0.5 clock cycles. Structures are much larger (22 nm
is the smallest possible we can buy in larger quantities). I have
to agree if I don't want to ignore physical limits. ;)

Bernhard Schornak

Aug 19, 2014, 12:02:08 PM
Andrew Cooper wrote:


>[...]
> One will take the same length of time as another instruction, while the
> latter will take a fraction of extra propagation delay through silicon.


Processors perform operations in single steps (clock cycles),
not as a continuous flow of signals like organic brains. When
there are two subsequent operations, they'll need two clocks,
not one.

wolfgang kern

Aug 19, 2014, 1:26:06 PM

Bernhard Schornak in discussion with Terje Mathisen:

...about max. performance

> Wavelength of my FX-8350 (4 GHz) is 7.5 cm, its µPGA case is 4.00
> cm. Going down to structure level, a 1 nm distance will be passed
> in 0.133 ns ~ 0.5 clock cycles. Structures are much larger (22 nm
> is the smallest possible we can buy in larger quantities). I have
> to agree if I don't want to ignore physical limits. ;)

:)
beside that the speed of light works out to ~3.3 nanosec/meter, it's the
wavelength which limits the frequency of clocks. 3 GHz means total
reflection at 100 mm.

Terje is right that modern CPU designs load both and even more on
every MOV register,... instruction.

So a simple MOV reg,... will also load the reversed value by default and,
in case of any following (within the instruction fetch) shift/rotate, also
load it into a 'bit-addressing' logic where shifts can be worked out by
alternate bit addressing rather than being really shifted like in the old
days. (486: Intel CPUs took a while to take over this AMD feature :)

__
wolfga


James Harris

Aug 19, 2014, 1:34:27 PM
"Bernhard Schornak" <scho...@nospicedham.web.de> wrote in message
news:lsvsbm$pb6$2...@dont-email.me...
> Andrew Cooper wrote:
>
>
>>[...]
>> One will take the same length of time as another instruction, while the
>> latter will take a fraction of extra propagation delay through silicon.
>
>
> Processors perform operations in single steps (clock cycles),
> not as a continuous flow of signals like organic brains. When
> there are two subsequent operations, they'll need two clocks,
> not one.

Not exactly. What a CPU can do in a clock cycle depends on the hardware that
it has between storage elements. Best I could find from a quick search was

https://www.cs.auckland.ac.nz/~jmor159/363/html/anatomy.html

Look at the diagram which shows blocks of combinatorial logic between
storage elements - what it calls registers (though these are banks of
flip-flops and not necessarily just the CPU registers assembly programmers
think of). On each clock the registers get updated with the signals which
come in to them from out of the logic blocks. (All logic blocks will have
been expected to have settled by the time of the clock edge.)

Depending on timing budgets CPUs can be designed to carry out a number of
operations in some of their combinational logic blocks. It is too big a
subject to go into here. Check out CPU datapath design for more info if you
are interested. Suffice to say that in the context of this discussion a CPU
can carry out multiple operations in a single clock cycle if it has been
designed to do so.

James


Rod Pemberton

unread,
Aug 19, 2014, 4:04:49 PM8/19/14
to
On Tue, 19 Aug 2014 13:34:27 -0400, James Harris
<james.h...@nospicedham.gmail.com> wrote:

> Suffice to say that in the context of this discussion a CPU
> can carry out multiple operations in a single clock cycle

That concept does have a name:
http://en.wikipedia.org/wiki/Pipeline_(computing)

It also has another name:
http://en.wikipedia.org/wiki/Complex_instruction_set_computing

> if it has been designed to do so.

It would have to be a really old microprocessor, i.e. pre-6502,
or perhaps a central processing unit, or a student processor
project, to _not_ have "been designed to do so." Basically,
every microprocessor post-6502 has pipelining to increase
average work per clock.


Rod Pemberton

DSF

unread,
Aug 19, 2014, 4:10:23 PM8/19/14
to
On Sun, 17 Aug 2014 19:13:13 -0400, Frank Kotler
<fbko...@nospicedham.myfairpoint.net> wrote:

>DSF wrote:
>
>...
>> cmp, cl byte ptr[epb]
>
>Typo alert!

Yes. And the mov line above it, too. The new code posted here was
comprised of a lot of cut/paste operations because I didn't feel like
aligning the whole thing again, so I modified the existing post. I
must have typed those two lines into the post because the actual
source is correct.

Bernhard Schornak

unread,
Aug 19, 2014, 4:13:58 PM8/19/14
to
wolfgang kern wrote:


> Bernhard Schornak in discussion with Terje Mathisen:
>
> ...about max. performance
>
>> Wavelength of my FX-8350 (4 GHz) is 7.5 cm, its µPGA case is 4.00
>> cm. Going down to structure level, a 1 nm distance will be passed
>> in 0.133 ns ~ 0.5 clock cycles. Structures are much larger (22 nm
>> is the smallest possible we can buy in larger quantities). I have
>> to agree if I don't want to ignore physical limits. ;)
>
> :)
> beside that speed of light is just ~3.3 nanosec/meter, it's the wave-length
> which limits the frequency of clocks. 3GHZ mean total reflection at 100mm.


Only if there is an obstacle where the waves are reflected.
Satellite TV works at distances of ten-thousands of km in a
frequency range between 10.6 and 12.7 GHz. Only the transfer is
slightly difficult - cable damping increases with growing
frequency. (For frequencies beyond 10 GHz, waveguides are a
must.)

> Terje is right that modern CPU-desings load both and even more on
> every MOV register,...-instruction.


A simple MOV is not a complex operation like NOT-INC-ADD or
NOT-INC-SUB. I still doubt these operations can be executed
in one clock cycle. As I speculated in my first posting, it
might be issued by the instruction decoder/scheduler, so it
doesn't matter how many clocks are required, because proper
data is loaded into the involved registers before the final
operation is performed.


> So a simple MOV reg,... will also load the reversed by default and
> in case of any followed (whithin instruction-fetch) shift/rotate also
> load it to a 'bit-addressing'-logic where shifts can be worked out by
> an alternate bit-addressing rather than be really shifted like in old
> days. (486: Intel CPUs took a while to take over this AMD-feature :)


Okay. Actually, I did not know this. It surely is not taken
from AMD's programming guides... ;)


Pfiat'Di

Bernhard

Bernhard Schornak

unread,
Aug 19, 2014, 4:14:12 PM8/19/14
to
For sure, but not if the result is required for the next step.
No physical hardware can perform an addition while the necessary
negation of one operand is in progress. This dependency cannot
be resolved in one and the same clock cycle.

DSF

unread,
Aug 19, 2014, 4:42:35 PM8/19/14
to
On Mon, 18 Aug 2014 05:13:38 -0400, "Rod Pemberton"
<dont_us...@nospicedham.xnothavet.cqm> wrote:

>On Sun, 17 Aug 2014 15:07:06 -0400, DSF <nota...@address.here> wrote:
>

{Analysis of C code snipped}
I understand how they could use adding a negative. I was interested
in, as expressed below, why do so when there is a perfectly good sub
instruction - unless, at the time the stack-frame-creation part of the
compiler was written, add was faster than sub.

I did a prehistoric Google search and came up with this page:
http://www.cs.dartmouth.edu/~mckeeman/cs48/mxcom/doc/x86.html

Excerpts:
ADD Integer addition
operands bytes 8088 186 286 386 486 Pentium
reg, imm 2+i(1,2) 4 4 3 2 1 1

SUB Integer subtraction
operands bytes 8088 186 286 386 486 Pentium
reg, imm 2+i(1,2) 4 4 3 2 1 1

Same all the way back here.

An issue of style or training, I guess.

>> Was there a time when add was faster than sub?
>
>For the 386, 486, and Pentium, that would be: "No." "add" and "sub"
>have the same official timings on each respective processor. I don't
>have official 286 or 86 (or 186) timings in my notes or more recent
>processors. I'd suspect they're the same on each of the earlier
>processors too. If they're different, I'd be curious to know why.
>
>
>Rod Pemberton

James Harris

unread,
Aug 19, 2014, 5:24:51 PM8/19/14
to
"Rod Pemberton" <dont_us...@nospicedham.xnothavet.cqm> wrote in message
news:op.xkuvebjq6zenlw@localhost...
> On Tue, 19 Aug 2014 13:34:27 -0400, James Harris
> <james.h...@nospicedham.gmail.com> wrote:
>
>> Suffice to say that in the context of this discussion a CPU
>> can carry out multiple operations in a single clock cycle
>
> That concept does have a name:
> http://en.wikipedia.org/wiki/Pipeline_(computing)

Pipelining is just one of *many* ways a CPU can carry out multiple ops per
clock cycle but that panoply is way off topic here. That's why I mentioned
that my reply was "in the context of this discussion" in that it applied
solely to combinational logic between storage elements. That was Bernhard's
point of issue so let's stick to that.

James


James Harris

unread,
Aug 19, 2014, 6:06:57 PM8/19/14
to
"Bernhard Schornak" <scho...@nospicedham.web.de> wrote in message
news:lt0b4a$bej$2...@dont-email.me...
This is too complex a topic to go into here but the combinational logic
doesn't occur in steps in the sense that I think you mean. Instead, signals
propagate through it. Any kind of "step" only occurs when the output of the
logic is latched into a subsequent storage element. Of course, as assembler
programmers we think of steps ... but the CPU will operate at a lower level
than that and can easily be wired to carry out more than one operation in a
given part of the data path.

For example, take a look at

http://en.wikipedia.org/wiki/Adder%E2%80%93subtractor

Note that the adder is preceded by a layer of not gates which form a 1's
complement of one of the inputs. When the C0 carry in is included the
circuit is designed to carry out a subtraction using an adder but there is
no need for a "step" between the two parts.

James


Bernhard Schornak

unread,
Aug 19, 2014, 7:58:32 PM8/19/14
to
I was speaking as an "electronics freak" with TTL or CMOS logic
in mind. In the end, processors are nothing else than large and
complex logic arrays with millions of logic circuits.

I am sure the scheduler (the unit following the instruction de-
coder) is responsible for pre-processing stuff like getting the
2's complement if it was required for add/sub.

BTW: The clock is the signal used to move data from one 'stage'
of a processor's units to the next. There are no analog signals,
only digital ones (which are handed through the execution chain
whenever the clock signal turns from high to low or vice versa,
depending on the logic design).


> For example, take a look at
>
> http://en.wikipedia.org/wiki/Adder%E2%80%93subtractor
>
> Note that the adder is preceded by a layer of not gates which form a 1's
> complement of one of the inputs. When the C0 carry in is included the
> circuit is designed to carry out a subtraction using an adder but there is
> no need for a "step" between the two parts.


Probably the thing Terje had in mind. I still can't see why the
subtraction can be replaced by an addition of the 1's (not 2's)
complement. Maybe I am too dumb to get the point, but I know my
calculator shows a wrong result if I add the 1's instead of the
2's complement of the number to subtract... ;)

Robert Wessel

unread,
Aug 20, 2014, 12:03:15 AM8/20/14
to
The one gets added with an inverted carry input.
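Robert's point - the "+1" of the two's complement is supplied by the inverted carry input, while inverting each bit needs no carry propagation at all - can be checked directly in C. A minimal sketch (the function name is mine, not from the thread):

```c
#include <stdint.h>

/* Subtraction on an adder: a - b == a + ~b + 1.
 * ~b is the one's complement (32 independent inversions, no carry
 * chain); the trailing +1 is exactly a carry-in of 1 at bit 0. */
uint32_t sub_via_add(uint32_t a, uint32_t b)
{
    return a + ~b + 1u;  /* NOT, then add with carry-in = 1 */
}
```

With two's-complement wraparound this identity holds for every pair of 32-bit values, which is why the adder-subtractor circuit only needs the inverters plus the carry-in.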

Rod Pemberton

unread,
Aug 20, 2014, 1:56:56 AM8/20/14
to
On Tue, 19 Aug 2014 17:24:51 -0400, James Harris
<james.h...@nospicedham.gmail.com> wrote:

> Pipelining is just one of *many* ways a CPU can carry out
> multiple ops per clock cycle [...]

True, that's _why_ I mentioned CISC too ...

> but that panoply is way off topic here.

Anything unrelated to strstr and x86 assembly code is "way
off topic here." But, that's exactly where you came in, right
when Bernhard, or perhaps Andrew, went off-topic. So, what's
your point? I doubt you're criticizing yourself for being OT ...

> That's why I mentioned that my reply was "in the context of
> this discussion" in that it applied solely to combinational
> logic between storage elements. That was Bernhard's point
> of issue so let's stick to that.

Well, I don't agree that that was Bernhard's point. I see no
mention or implication of intermediate combinational logic in
any of his statements. His statement "not as a continuous
flow of signals like organic brains" more accurately describes
analog computation than it does combinational logic, although
either could be assumed. The remainder of his statement aptly
describes either pipelining or CISC, which neither of you used
by name. ISTM, the aspect of combinational logic of the thread
was entirely introduced or assumed by you. Part of my point
that you missed was that you two were being exceptionally vague
about what exactly was under discussion. As I've indicated,
there are multiple possible reasonable choices that a reader
could comprehend.


Rod Pemberton

Rod Pemberton

unread,
Aug 20, 2014, 2:10:40 AM8/20/14
to
On Wed, 20 Aug 2014 01:56:56 -0400, Rod Pemberton
<dont_us...@nospicedham.xnothavet.cqm> wrote:

> Well, I don't agree that that was Bernard's point. I see no
> mention or implication of intermediate combinational logic in
> any of his statements. His statement "not as a continuous
> flow of signals like organic brains" more accurately describes
> analog computation than it coes combinational logic, although
> either could be assumed.

To add even more confusion to his statements, he could've
been referencing asynchronous digital circuitry using
static RAM, but that's not how the human brain works.


Rod Pemberton

Terje Mathisen

unread,
Aug 20, 2014, 3:57:05 AM8/20/14
to
"Speculate" is the operative word here:

You simply don't know what you are talking about. :-(

Please do some actual study and then come back. (You don't have to crawl
on your knees. :-) )

Re. the 32-bit adder with an XOR gate on the input:

In order to use ADD instead of SUB in asm code, you do have to negate
the second operand, something which needs exactly the same 32-bit wide
carry propagation as a full add or sub, so this can't be any faster.

However, what I wrote was that you simply negate each bit, i.e. XOR with
1: This is 32 independent XOR gates and takes a very small fraction of a
cycle.

In order to get the effect of NEG after doing a NOT we have to add 1,
but this is exactly the same as setting the incoming carry bit to the
bottom of the adder unit.

I.e. in addition to having carry propagation between each bit position,
we also have an incoming carry, making all the stages identical.

Terje

> might be issued by the instruction decoder/scheduler, so it
> doesn't matter how many clocks are required, because proper
> data is loaded into the involved registers before the final
> operation is performed.
>
>
>> So a simple MOV reg,... will also load the reversed by default and
>> in case of any followed (whithin instruction-fetch) shift/rotate also
>> load it to a 'bit-addressing'-logic where shifts can be worked out by
>> an alternate bit-addressing rather than be really shifted like in old
>> days. (486: Intel CPUs took a while to take over this AMD-feature :)
>
>
> Okay. Actually, I did not know this. It surely is not taken
> from AMD's programming guides... ;)
>
>
> Pfiat'Di
>
> Bernhard


wolfgang kern

unread,
Aug 20, 2014, 5:04:29 AM8/20/14
to

Bernhard Schornak wrote:

...
>> Terje is right that modern CPU-desings load both and even more on
>> every MOV register,...-instruction.

> A simple MOV is not a complex operation like NOT-INC-ADD or
> NOT-INC-SUB. I still doubt these operations can be executed
> in one clock cycle. As I speculated in my first posting, it
> might be issued by the instruction decoder/scheduler, so it
> doesn't matter how many clocks are required, because proper
> data is loaded into the involved registers before the final
> operation is performed.

>> So a simple MOV reg,... will also load the reversed by default and
>> in case of any followed (whithin instruction-fetch) shift/rotate also
>> load it to a 'bit-addressing'-logic where shifts can be worked out by
>> an alternate bit-addressing rather than be really shifted like in old
>> days. (486: Intel CPUs took a while to take over this AMD-feature :)

> Okay. Actually, I did not know this. It surely is not taken
> from AMD's programming guides... ;)

These parallel load features have become quite popular on modern CPUs of
any kind. So whenever a register (there is more than one RAX etc. inside)
is loaded, a converted copy is present at the very same time.
If you look at TTL flip-flops, you get two output pins (Q and notQ),
so there is no time difference between non-inverting and inverting gating.
wolfgang


James Harris

unread,
Aug 20, 2014, 5:03:48 AM8/20/14
to
"Rod Pemberton" <dont_us...@nospicedham.xnothavet.cqm> wrote in message
news:op.xkvms6x96zenlw@localhost...
> On Tue, 19 Aug 2014 17:24:51 -0400, James Harris
> <james.h...@nospicedham.gmail.com> wrote:
>
>> Pipelining is just one of *many* ways a CPU can carry out
>> multiple ops per clock cycle [...]
>
> True, that's _why_ I mentioned CISC too ...
>
>> but that panoply is way off topic here.
>
> Anything unrelated to strstr and x86 assembly code is "way
> off topic here." But, that's exactly where you came in, right
> when Bernard, or perhaps Andrew, went off-topic. So, what's
> your point? I doubt you're criticizing yourself for being OT ...

Yes, that's true about the thread subject: we are already off topic. I was
thinking that if we got into the many ways processors carry out multiple
operations per clock cycle we would be getting way off topic! And, besides, I
didn't want to get into a deep discussion on this. I was just trying to help
Bernhard to see that Andrew's comment was valid. (In fact, the
adder-subtractor that I linked to was an example of what Andrew was talking
about.)

>> That's why I mentioned that my reply was "in the context of
>> this discussion" in that it applied solely to combinational
>> logic between storage elements. That was Bernhard's point
>> of issue so let's stick to that.
>
> Well, I don't agree that that was Bernard's point. I see no
> mention or implication of intermediate combinational logic in
> any of his statements. His statement "not as a continuous
> flow of signals like organic brains" more accurately describes
> analog computation than it coes combinational logic, although
> either could be assumed. The remainder of his statement aptly
> describes either pipelining or CISC, which neither of you used
> by name. ISTM, the aspect of combinational logic of the thread
> was entirely introduced or assumed by you. Part of my point
> that you missed was that you two were being exceptionally vague
> about what exactly was under discussion. As I've indicated,
> there are multiple possible reasonable choices that a reader
> could comprehend.

Water under the bridge now but Bernhard had said that two subsequent
operations need two clock cycles. That's how an asm programmer would
naturally think and may be how a given CPU has been built but, as
illustrated, a CPU can be built differently and have *multiple* logic blocks
between flip-flop stages.

Incidentally, and mainly for interest, the best modern example of the
principle that I can think of is the Arm. It has a normal ALU with two
inputs but, IIRC, while one of the inputs goes directly into the ALU the
other can be significantly transformed by a logic block. The second input to
the ALU can be shifted logically or arithmetically, or it can be rotated
left or right with or without the carry! The extra transformation doesn't
need any additional clock cycles; both transformation and ALU operation
happen in one go. In other words the transformation is designed to be
applied before one leg of the ALU with no intervening storage element.

That leads to some strange-looking assembly code such as (and this is from
memory so may need to be tweaked)

add R0, R1, R2 ror 6

meaning load into R0 the sum of R1 and (R2 rotated right 6 places without
the carry). On x86 we would have to code the rotation and addition
separately but on Arm they can be part of a single instruction.
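For reference, the effect of that instruction (in current Arm assembly syntax it would be written `ADD R0, R1, R2, ROR #6`) can be sketched in C as a rotate feeding one ALU input; the helper names are mine, not from the thread:

```c
#include <stdint.h>

/* Rotate right by n (0..31), as the Arm barrel shifter does. */
uint32_t ror32(uint32_t x, unsigned n)
{
    n &= 31u;
    return n ? (x >> n) | (x << (32u - n)) : x;
}

/* What "add R0, R1, R2 ror 6" computes: the rotate and the add
 * happen in one instruction, with no intervening register. */
uint32_t add_ror6(uint32_t r1, uint32_t r2)
{
    return r1 + ror32(r2, 6);
}
```

On x86 the same result needs a separate `ror` followed by an `add`.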

Similar gymnastics can be applied to immediate constants and to memory
addresses which can lead to clever ways to specify big constants in 12 bits
and clever ways to step through memory. It's a fascinating design - for
anyone who is interested, that is.

As I say, way off topic. :-)

James


Bernhard Schornak

unread,
Aug 20, 2014, 11:49:31 AM8/20/14
to
Yes, and I used it intentionally.


> You simply don't know what you are talking about. :-(
>
> Please do some actual study and then come back. (You don't have to crawl on your knees. :-) )


Thank you for the overwhelming information packed into a few
sentences! ;)


> Re. the 32-bit adder with an XOR gate on the input:
>
> In order to use ADD instead of SUB in asm code, you do have to negate the second operand, something
> which needs exactly the same 32-bit wide carry propagation as a full add or sub, so this can't be
> any faster.


I never asked why carry propagation is faster, I'm searching
for a physical explanation how signals can be high *and* low
in the same clock cycle. In other words: I try to understand
how this adder works on transistor level. The MOSFETs in the
adder either work as conductor or as insulator, depending on
the potential applied to their gates. If this potential lies
between the defined levels for high and low, the results are
in an undefined state (jitter/noise).


> However, what I wrote was that you simply negate each bit, i.e. XOR with 1: This is 32 independent
> XOR gates and takes a very small fraction of a cycle.


Is there a physical explanation how one single transistor on
the die can have multiple potentials within one low-high-low
transition of the clock signal? In general, output potential
is a function of the input potential. If there were multiple
possible potentials *within* a single clock cycle, we didn't
need a clock signal any longer. *Multiple* potentials in one
and the same clock cycle are possible in analog circuits (e.
g. the FET booster I developed for my guitars), but never in
digital ones.


> In order to get the effect of NEG after doing a NOT we have to add 1, but this is exactly the same
> as setting the incoming carry bit to the bottom of the adder unit.
>
> I.e. in addition to having carry propagation between each bit position, we also have an incoming
> carry, making all the stages identical.


Sorry, but I do not buy this. I have to refresh my knowledge
about the signal flow inside CMOS adders before I will reply
to the last two paragraphs.

Bernhard Schornak

unread,
Aug 20, 2014, 11:49:49 AM8/20/14
to
wolfgang kern wrote:


> This parallel Load-features became quite popular on modern CPUs of any
> kind. So whenever a register (there is more than one RAX etc. inside)
> is loaded, then a converted copy is present at the very same time.
> If you look at TTL flipflops, you got two output-pins (Q and notQ).
> So there is no time-difference between non- and -inverting gating.


I agree with all points, but: Loading data into registers is
done early in the execution chain - long before the register
is sent to the execution pipeline. There are at least two
clocks between decoding and executing each instruction.

What I cannot agree with are *multiple* potential changes of
one specific logic gate within one and the same clock cycle.
This simply contradicts my knowledge and experience with any
digital logic family I ever worked with.


Pfiat'Di

Bernhard

Robert Wessel

unread,
Aug 20, 2014, 12:06:21 PM8/20/14
to
On Wed, 20 Aug 2014 11:04:29 +0200, "wolfgang kern" <now...@never.at>
wrote:
While that's a natural consequence of TTL (and often CMOS) flip-flops,
actually getting the output to the next stage requires drive circuits
of the inverted output (assuming it exists). Those are not free. And
doubling the number of traces you need in large parts of the processor
just because you occasionally want the inverted values is pretty
unlikely to be a win.

Terje Mathisen

unread,
Aug 20, 2014, 3:01:47 PM8/20/14
to
How _do_ you think any logic circuit works?

You have stated that you are familiar with binary logic and gates, but
it still seems that you think the clock cycles in a modern CPU define
the actual single gate time?

A modern adder will of course NOT depend on a single ripple chain in
order to get a bottom carry which turns all the higher bits to flip, all
the way to the top.

Let's start with the basics:

I assume you know what a full adder is?

It takes three single-bit inputs and provides two outputs, the sum and
the carry (if the sum of the three bits turned out to be 2 or 3).

Using single logic gates you can do this with something like this:

out = (a ^ b) ^ carry_in
carry_out = (a & b) | (a & carry_in) | (b & carry_in)

which you can simplify to this, using pairwise simple operations

t1 = a ^ b
t2 = a & b

out = t1 ^ carry_in
t3 = t1 & carry_in

carry_out = t2 | t3;

I.e. this is a minimum of 3 clock cycles if implemented in asm
operations, but only 3 gate times for the HW.

Chaining 32 such blocks together would give us the 32-bit result plus an
outgoing carry bit, and it would take about 96 gate times.

Now, even if there is room for quite a few gate times between each clock
cycle, we don't have room for 96 of them (plus whatever is needed for
routing the input and output bits!), so we have to make the carry
propagation much faster, right?

One method is to split the adder into smaller blocks, let's start with
three 16-bit adders:

One adder does the bottom 16 bits, returning a result in 16x3=48 gate
delays, and the other two do the high 16 bits: one of them assumes that
there will be no incoming carry from the lower half, while the other one
is wired with carry high.

Both produce their results in the same 48 gate times as the low half,
then we use the carry from the low end to select the correct result,
getting the final answer in about 50 gate times instead of nearly 100.

The same approach can be used recursively for each of the 16-bit adders
etc, in the end we can get the final result in O(log(N)) time instead of
O(N).
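The carry-select idea is easy to model in C: add the low half once, add the high half twice for both possible carries, and let the low half's carry-out pick the winner. A toy sketch, not how real hardware is expressed (function name mine):

```c
#include <stdint.h>

/* Carry-select addition, split into two 16-bit halves. In hardware
 * hi0 and hi1 are computed in parallel with lo; the final mux is
 * what replaces waiting for the carry to ripple all the way up. */
uint32_t carry_select_add(uint32_t a, uint32_t b)
{
    uint32_t lo  = (a & 0xFFFFu) + (b & 0xFFFFu); /* 17-bit result */
    uint32_t ah  = a >> 16, bh = b >> 16;
    uint32_t hi0 = (ah + bh)      & 0xFFFFu;      /* assumes carry 0 */
    uint32_t hi1 = (ah + bh + 1u) & 0xFFFFu;      /* assumes carry 1 */
    uint32_t hi  = (lo >> 16) ? hi1 : hi0;        /* select on carry */
    return (hi << 16) | (lo & 0xFFFFu);
}
```

Applying the same split recursively inside each 16-bit block is what yields the O(log(N)) depth.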

Rod Pemberton

unread,
Aug 20, 2014, 7:37:12 PM8/20/14
to
On Wed, 20 Aug 2014 11:49:31 -0400, Bernhard Schornak
<scho...@nospicedham.web.de> wrote:

> I never asked why carry propagation is faster, [...]

> I'm searching for a physical explanation how signals
> can be high *and* low in the same clock cycle.

There can be a number of answers to that.

Digital electronic signals are discretized. It takes some portion
of a clock for a signal to settle or change. This may be slightly
longer for one signal and shorter for another. Both are completed
during the clock. Think sampled analog instead of strict digital,
where the analog signal is two-level binary. The analog wouldn't
change specifically on clock boundaries but would settle and change
between clocks. E.g., one signal path may have 5 transistors
where another has 25. Each transistor has a certain amount of
time delay per state change since it's a physical device. There
is also clock related switching delay. So, the signal path with
25 transistors will take longer for the signal to propagate.

Also, digital clock speeds can be higher internally than externally.
E.g., you may have an internal PLL (phase-locked loop) running at
4 MHz with a 1 MHz external clock. So, you have four internal clock
transitions per one external clock. The internal signals could be
high and low multiple times per clock.

> Is there a physical explanation how one single transistor on
> the die can have multiple potentials within one low-high-low
> transition of the clock signal?

A "digital" transistor is NOT a digital device. It's analog.
It's modeled as an on-off switch and used as one, but it's still
an analog device. A transistor in digital circuits is not suitable
for audio signals. It's suitable for signals which can change
between two voltage extremes: high and low. There is still an
analog portion of the signal, when the transistor is changing from
high to low or low to high. It's a very fast transition, but not
instantaneous. Think of an audio transistor driven into clipping.

> In general, output potential
> is a function of the input potential. If there were multiple
> possible potentials *within* a single clock cycle, we didn't
> need a clock signal any longer.

Why? You'd still want a way to indicate a change of state and
steady-state conditions. That's what the clock does.

It seems you keep coming back to this issue, like you're interested
in having the electronics free-run as fast as possible without a
clock. That's known as asynchronous. Asynchronous digital circuits
have no clocking.

http://en.wikipedia.org/wiki/Asynchronous_circuit

> *Multiple* potentials in one and the same clock cycle are possible
> in analog circuits (e.g. the FET booster I developed for my guitars),
> but never in digital ones.

I'm pretty sure they've developed (two types of) three-state logic:

http://en.wikipedia.org/wiki/Three-state_logic
http://en.wikipedia.org/wiki/Ternary_logic


Rod Pemberton
(I'm not sure Bernd is seeing my posts ...)

Bernhard Schornak

unread,
Aug 21, 2014, 1:19:17 PM8/21/14
to
Rod Pemberton wrote:


> On Wed, 20 Aug 2014 11:49:31 -0400, Bernhard Schornak <scho...@nospicedham.web.de> wrote:
>
>> I never asked why carry propagation is faster, [...]
>
>> I'm searching for a physical explanation how signals
>> can be high *and* low in the same clock cycle.
>
> There can be a number of answers to that.
>
> Digital electronic signals are discretized. It takes some portion
> of a clock for a signal to settle or change. This may be slightly
> longer for one signal and shorter for another. Both are completed
> during the clock. Think sampled analog instead of strict digital,
> where the analog signal is two-level binary. The analog wouldn't
> change specificly on clock boundaries but would settle and change
> between clocks. E.g., one signal path may have 5 transistors
> where another has 25. Each transistor has a certain amount of
> time delay per state change since it's a physical device. There
> is also clock related switching delay. So, the signal path with
> 25 transisters will take longer for the signal to propagate.


Agreed. At frequencies beyond 2 GHz, short wavelengths are an
important factor (Terje and Wolfgang mentioned this before).


> Also, digital clock speeds can be higher internally than externally.
> E.g., you may have a internal PLL (phase-locked loop) running at
> 4Mhz with a 1Mhz external clock. So, you have four internal clock
> transitions per one external clock. The internal signals could be
> high and low multiple times per clock.


Probably not the case if the clock frequency is above 2 GHz -
power consumption grows markedly with rising frequency.


>> Is there a physical explanation how one single transistor on
>> the die can have multiple potentials within one low-high-low
>> transition of the clock signal?
>
> A "digital" transistor is NOT a digital device. It's analog.
> It's modeled as a on-off switch and used as one, but it's still
> an analog device. A transistor in digital circuits is not suitable
> for audio signals. It's suitable for signals which can change
> between two voltage extremes: high and low. There is still an
> analog portion of the signal, when the transistor is changing from
> high to low or low to high. It's a very fast transistion, but not
> instantaneous. Think of an audio transistor driven into clipping.


Agreed. I was aware that transistors do not work in a "digital"
manner, but MOSFETs inside processors are trimmed to *behave*
like "digital" devices (shortest possible delays between 'on'
and 'off' state switches).

Transistors, especially FETs, act as capacitors as well. At
high frequencies, capacitances, including those of signal and
power lanes on dies, are the main problem: signals are damped,
and parasitic charges (better: changes of the charges) increase
power consumption.

BTW: You can (ab)use AND gates as audio amplifiers. CMOS-FETs
do not clip hard like bipolar transistors. They deliver some-
thing like the legendary "valve sound"...


>> In general, output potential
>> is a function of the input potential. If there were multiple
>> possible potentials *within* a single clock cycle, we didn't
>> need a clock signal any longer.
>
> Why? You'd still want a way to indicate a change of state and
> steady state conditions. That what the clock does.
>
> It seems you keep coming back to this issue, like you're interested
> in having the electronics free-run as fast as possible without a
> clock. That's known as asynchronous. Asynchronous digital circuits
> have no clocking.
>
> http://en.wikipedia.org/wiki/Asynchronous_circuit


I will have a look at that when I have access to DSL again -
currently, I surf with an internet stick (where each MB costs
€ 0,24)...


>> *Multiple* potentials in one and the same clock cycle are possible
>> in analog circuits (e.g. the FET booster I developed for my guitars),
>> but never in digital ones.
>
> I'm pretty sure they've developed (two types) of three state logic:
>
> http://en.wikipedia.org/wiki/Three-state_logic
> http://en.wikipedia.org/wiki/Ternary_logic
>
> Rod Pemberton
> (I'm not sure Bernd is seeing my posts ...)


I do (and I read them, as well). As long as I agree with your
arguments, I will not reply if I do not have to add something
new... ;)

Bernhard Schornak

unread,
Aug 21, 2014, 1:19:47 PM8/21/14
to
It seems we are talking about very different things. This is
what I know as "adder":

https://www.ee.iitb.ac.in/uma/~wel/wel12/Components%2520Records/digital%2520ic%2520datasheet/cd4008.pdf

There is a better data sheet for Intersil's 4008, but I have
no link for a download location (still own a paper version).
CD4000 structures are quite large - it should be possible to
design small 64 bit adders with up-to-date 22 nm lithography
technology.

BTW: Your gate times might be interpreted wrong, because the
32 (or 64) bit-additions are performed simultaneously, so it
always takes one "step" to add N bits. Depending on the chip
design, the carries could be added within the same step when
they are forwarded through a faster "path" than the results.
As you told before: Propagation through gates is faster than
propagation via lanes (the die's "wires"), so the result can
be delivered within a single clock cycle, because the signal
'moves' from the registers through the bit adders and can be
'delayed' by routing through lanes while carries are sent to
non-inverting gates and 'reach' the final stage of the adder
together with the results.

The last paragraph assumes we 'watch' 2 * 32 (or 64) signals
'creeping' through silicon structures during the active part
of the clock signal. While active, the signals move some nm,
depending on the clock frequency, and 'reach' the next stage
of the execution pipeline (Rod's postings!) before the clock
returns to its inactive state.

What I stumbled upon was your

"...takes a very small fraction of a cycle..."

which is a little bit confusing, because the clock signal is
the smallest possible unit in a clock driven machine (like a
quark in the Standard Model). With wave propagation in mind,
it gets less confusing, though... ;)

George Neuner

unread,
Aug 22, 2014, 2:03:12 AM8/22/14
to
On Tue, 19 Aug 2014 18:34:27 +0100, "James Harris"
<james.h...@nospicedham.gmail.com> wrote:


>What a CPU can do in a clock cycle depends on the hardware that
>it has between storage elements.

And just to throw more fuel on the fire ... over in comp.arch, Ivan
Godard has been talking about a new chip design he has been working
with that (at least theoretically) can execute 30+ instructions per
cycle while consuming only DSP-like power. See http://ootbcomp.com/.

George

Terje Mathisen

unread,
Aug 22, 2014, 3:09:17 AM8/22/14
to
You might have noticed that I have been part of that project since February?

In fact, it has been the work on defining and implementing hw helper
functions for a sw fp emulation library (for Mill versions too tiny to
have a proper fp unit) that has forced me to update my knowledge about
what is and isn't doable in a single clock cycle on modern hw.
:-)

Anyway, the Mill is the most exciting development in the cpu
architecture area in at least a decade, but since the main designer,
Ivan G, has a very strong background in compiler design and
implementation, the Mill is designed to never need asm code.

In fact, there is no real asm language defined for this architecture:
Every single Mill model will have an individually optimized instruction
set, using arithmetic coding/compression for maximum density.

Programs will be compiled and distributed in a model-independent
intermediate format, then a model-specific specializer (i.e. really a
form of JIT compiler) will convert this into the actual/current
instruction set.

It is during this stage that any unimplemented instructions will be
replaced by emulation sequences/calls.

Rod Pemberton

unread,
Aug 22, 2014, 3:21:21 AM8/22/14
to
On Thu, 21 Aug 2014 13:19:17 -0400, Bernhard Schornak
<scho...@nospicedham.web.de> wrote:
> On Wed, 20 Aug 2014 19:37:12 -0400, Rod Pemberton
> <dont_us...@nospicedham.xnothavet.cqm> wrote:

[our (mis)spellings]

>> It's a very fast transistion,

s/transistion/transition

Extra 's' Rod ...

> At high frequencies, capacities,

s/capasities/capacitances

> Signals are dampened and parasitary charges

s/parasitary/parasitic

(I'm presuming that's what you intended.)


RP

Rod Pemberton

unread,
Aug 22, 2014, 3:56:03 AM8/22/14
to
On Fri, 22 Aug 2014 02:03:12 -0400, George Neuner
<gneu...@nospicedham.comcast.net> wrote:
> On Tue, 19 Aug 2014 18:34:27 +0100, "James Harris"
> <james.h...@nospicedham.gmail.com> wrote:

[subject change]

>> What a CPU can do in a clock cycle depends on the hardware that
>> it has between storage elements.
>
> And just to throw more fuel on the fire ... over in comp.arch, Ivan
> Godard has been talking about a new chip design he has been working
> with that (at least theoretically) can execute 30+ instructions per
> cycle while consuming only DSP-like power. See http://ootbcomp.com/.
>

My question is: "Who is he?" ...

He seems to have come out of nowhere, mostly via
self-promotion, interviews, Youtube and Hackaday videos,
it seems, and took the computing world by storm over
his Mill architecture or belt processor a year or two
ago. Articles say his company has been working on it
for a decade. He's CEO/CTO of Out-of-the-Box Computing.
After a decade, why is he going public with this now?
Is he embracing open-source hardware?

It's clear from his posts to comp.compilers that
he does have some background in compilers. I haven't
been reading comp.arch.

This paragraph on him is a very interesting read:
http://www.ftpress.com/authors/bio/de5f140d-e5bf-4e83-b020-1db28a80b71c

From that, I'm not sure if he's an eclectic genius,
a nut, a huckster, or a fraud.

Nightclubs? Importer of rare cars? What the ...

And, there doesn't seem to be any info on whom he developed
the numerous compilers, instruction sets, and operating
system for. Were these hobby projects or corporate?

How does someone without degrees get to teach graduate
and doctorate level CS courses? ...

It seems the world is taken with the fact that he looks
like Sir Ian McKellen as Gandalf in The Hobbit movies.
Am I wrong? Funny? Rude?

On the technical side, the "belt" processor seems to
have aspects of stack languages like Forth, and/or
the tape of a Turing machine, to me.


Rod Pemberton

Rod Pemberton

unread,
Aug 22, 2014, 4:15:24 AM8/22/14
to
On Fri, 22 Aug 2014 03:56:03 -0400, Rod Pemberton
<dont_us...@nospicedham.xnothavet.cqm> wrote:

> On the technical side, the "belt" processor seems to
> have aspects of stack languages like Forth, and/or
> the tape of a Turing machine, to me.
>

"[Ivan]‘s solution to this problem is replacing the
registers in a CPU with something called a ‘belt’ --
basically a weird combination of a stack and a
shift register."

http://hackaday.com/2013/08/02/the-mill-cpu-architecture/

Isn't that exactly how the 8087 works? ...
The 8087 has an 8 register LIFO stack.

The 8086 can directly access the stack via MOV,
without using PUSH or POP.


Rod Pemberton

Terje Mathisen

unread,
Aug 22, 2014, 4:55:09 AM8/22/14
to
Please take a few hours to watch several of the Mill presentations, the
belt is indeed a brand new concept.

However, it is the synergy between the belt-enabled automatic garbage
collection (i.e. register liveness), the memory status bits in the
cache(s) which allow most memory transactions to never actually touch
ram (NaR/None) and several other features that actually does seem to work.

Robert Wessel

unread,
Aug 22, 2014, 5:31:44 AM8/22/14
to
On Fri, 22 Aug 2014 04:15:24 -0400, "Rod Pemberton"
Not even close. The belt is nothing at all like the x87 stack. Unlike
the x87's stack, the belt is always "moving" in one direction, and
results naturally fall off the end after a given time. So a bit more
like a shift register, but it's not physically implemented as a shift
register, and things don't actually move on it (except logically).
The belt basically replaces registers for both sources and
destinations.

Personally I'm not convinced that the ILP claims are plausible, but
it's an interesting approach nonetheless.

Bernhard Schornak

unread,
Aug 22, 2014, 6:40:25 AM8/22/14
to
Rod Pemberton wrote:


> On Thu, 21 Aug 2014 13:19:17 -0400, Bernhard Schornak <scho...@nospicedham.web.de> wrote:
>> On Wed, 20 Aug 2014 19:37:12 -0400, Rod Pemberton <dont_us...@nospicedham.xnothavet.cqm> wrote:
>
> [our (mis)spellings]
>
>>> It's a very fast transistion,
>
> s/transistion/transition
>
> Extra 's' Rod ...
>
>> At high frequencies, capacities,
>
> s/capasities/capacitances


Yes, capacitance in pF. Our German term "Kapazität" either
means capacity or capacitance. I wasn't aware that English
has two different words for it. Such things happen if I do
not ask http://dict.leo.org/ for the proper translation. ;)


>> Signals are dampened and parasitary charges
>
> s/parasitary/parasitic
>
> (I'm presuming that's what you intended.)


Yes, again - "parasitic charges" => unwanted charges which
cannot be avoided nor suppressed completely.

Thanks for any correction - one never stops learning!

Alexei A. Frounze

unread,
Aug 22, 2014, 9:39:33 AM8/22/14
to
On Friday, August 22, 2014 12:56:03 AM UTC-7, Rod Pemberton wrote:
...

[Ivan Godard]

> My question is: "Who is he?" ...
>
> He seems to have come out of no where, mostly via
> self-promotion, interviews, Youtube and Hackaday videos,
> it seems, and took the computing world by storm over
> his Mill architecture or belt processor a year or two
> ago. Articles say his company has been working on it
> for a decade. He's CEO/CTO of Out-of-the-Box Computing.
> After a decade, why is he going public with this now?
> Is he embracing open-source hardware?

He's saying that he's giving presentations when the appropriate patents have been filed, basically, when he feels safe to disclose his IP.

Other than that, I'm guessing he's still looking for extra money (investors) or a buyer. He's openly saying he's in it for the money, which is nothing unusual. But it may have certain implications, like the Mill CPU never seeing the market, if it ever leaves the fab.

Watch the videos or attend one of his talks. I've attended one at a local C/C++ meet-up event.

Alex

wolfgang kern

unread,
Aug 22, 2014, 3:19:42 PM8/22/14
to

Bernhard Schornak wrote:

>> This parallel Load-features became quite popular on modern CPUs of any
>> kind. So whenever a register (there is more than one RAX etc. inside)
>> is loaded, then a converted copy is present at the very same time.
>> If you look at TTL flipflops, you got two output-pins (Q and notQ).
>> So there is no time-difference between non- and -inverting gating.

> I agree with all points, but: Loading data into registers is
> done early in the execution chain - long before the register
> is sent to the execution pipeline. There are at least two or
> more clocks between decoding and executing each instruction.

This was just an example of what we did in TTL-(pre-MCU)-times.
btw: "registers were never sent!" they were just asked for contents. :)

> What I cannot agree with are *multiple* potential changes of
> one specific logic gate within one and the same clock cycle.
> This simply contradicts my knowledge and experience with any
> digital logic family I ever worked with.

Modern chip landscapes allow multiple (parallel connected) gating
of all inputs, either IMM or [MEM] or even both and more.
A single transistor can be located as 'positive logic' or opposite,
and dual pairs are already implemented in almost all newer CPUs,
even if not necessarily mentioned in the user manuals.

You can physically connect as many gates as you want or can afford
within one single clock transition, and there are a lot of opportunities.

ie: a rising clock edge may trigger several actions at once.

And you can believe that design engineers today try their very best
to achieve top performance, even at often market-limited costs.

servas
__
wolfgang


Bernhard Schornak

unread,
Aug 22, 2014, 4:47:56 PM8/22/14
to
wolfgang kern wrote:


> Bernhard Schornak wrote:
>
>>> This parallel Load-features became quite popular on modern CPUs of any
>>> kind. So whenever a register (there is more than one RAX etc. inside)
>>> is loaded, then a converted copy is present at the very same time.
>>> If you look at TTL flipflops, you got two output-pins (Q and notQ).
>>> So there is no time-difference between non- and -inverting gating.
>
>> I agree with all points, but: Loading data into registers is
>> done early in the execution chain - long before the register
>> is sent to the execution pipeline. There are at least two or
>> more clocks between decoding and executing each instruction.
>
> This was just an example of what we did in TTL-(pre-MCU)-times.
> btw: "registers were never sent!" they were just asked for contents. :)


Should have been "long before a register's content is sent".
Registers are physically located somewhere on the die - they
cannot move to another location, of course. ;)


>> What I cannot agree with are *multiple* potential changes of
>> one specific logic gate within one and the same clock cycle.
>> This simply contradicts my knowledge and experience with any
>> digital logic family I ever worked with.
>
> Modern chip-landscape allow multiple (parallel connected) gating
> of all inputs either IMM or [MEM] or even both and more.
> A single transistor can be located as 'positive logic' or opposite,
> and dual pairs are already implemented in almost all newer CPUs
> even not neccessarirly mentioned in the user-manuals.


CMOS generally uses complementary MOSFETs: One N- and one P-
channel MOSFET are tied together and work as a pair.


> You can physically connect as many gates as you want or can afford
> whithin one single clock-transition, and there are alot opportunities.
>
> ie: a rising clock edge may trigger several actions at once.


In active high logic, okay. In active low logic, the falling
edge triggers the active half of the clock signal.


> And you can believe in that design-engineers today try their very best
> to achieve top performance even on often merchant-limited costs.


For sure. If they don't, their competitors might gain market
shares...

George Neuner

unread,
Aug 22, 2014, 9:03:31 PM8/22/14
to
On Fri, 22 Aug 2014 09:09:17 +0200, Terje Mathisen
<terje.m...@nospicedham.tmsw.no> wrote:

>George Neuner wrote:
>>
>> And just to throw more fuel on the fire ... over in comp.arch, Ivan
>> Godard has been talking about a new chip design he has been working
>> with that (at least theoretically) can execute 30+ instructions per
>> cycle while consuming only DSP-like power. See http://ootbcomp.com/.
>
>You might have noticed that I have been part of that project since
>February?

Hi Terje,

I did know you were on comp.arch and aware of the Mill, but somehow I
missed that you had become part of the project.

>In fact, there is no real asm language defined for this architecture:
>Every single Mill model will have an individually optimized instruction
>set, using arithmetic coding/compression for maximum density.

In a former life I was part of something completely different 8-)

Back before GPUs became fashionable, I was with a group that developed
a programming system for a PCI based FPGA processing board. The board
was flexible: holding from 1 to 4 plug in FPGAs, available in a couple
of different sizes, and huge multi-banked memory. We developed a
common library of 2D and 3D array and image processing functions,
together with a compiler that enabled users to write high speed
programs targeting any subset of the available hardware. The user
would indicate how many FPGAs to use and the compiler automagically
handled FPGA (re)configuration, parameter setup and propagation of
results to dependent inputs further down the function chain, all in an
attempt to keep the chain going at best speed. Less than compiling
for your average OoO CPU, but way more than scripting.

It was fun while it lasted.

George

Rod Pemberton

unread,
Aug 23, 2014, 3:16:09 AM8/23/14
to
On Fri, 22 Aug 2014 05:31:44 -0400, Robert Wessel
<robert...@nospicedham.yahoo.com> wrote:
> On Fri, 22 Aug 2014 04:15:24 -0400, "Rod Pemberton"
> <dont_us...@nospicedham.xnothavet.cqm> wrote:
>> On Fri, 22 Aug 2014 03:56:03 -0400, Rod Pemberton
>> <dont_us...@nospicedham.xnothavet.cqm> wrote:

>>> On the technical side, the "belt" processor seems to
>>> have aspects of stack languages like Forth, and/or
>>> the tape of a Turing machine, to me.
>>>
>>
>> "[Ivan]‘s solution to this problem is replacing the
>> registers in a CPU with something called a ‘belt’ --
>> basically a weird combination of a stack and a
>> shift register."
>>
>> http://hackaday.com/2013/08/02/the-mill-cpu-architecture/
>>
>> Isn't that exactly how the 8087 works? ...
>> The 8087 has an 8 register LIFO stack.
>>
>> The 8086 can directly access the stack via MOV,
>> without using PUSH or POP.
>
> Not even close. The belt is nothing at all like the x87 stack. Unlike
> the x87's stack, the belt is always "moving" in one direction, and
> results naturally fall off the end after a given time. So a bit more
> like a shift register, but it's not physically implemented as a shift
> register, and things don't actually move [on] it (except logically).
> The belt basically replaces registers for both sources and
> destinations.

To me, that says ring or circular buffer.


Rod Pemberton

Terje Mathisen

unread,
Aug 23, 2014, 4:44:11 AM8/23/14
to
Bernhard Schornak wrote:
> wolfgang kern wrote:
>
>
>> Bernhard Schornak wrote:
>>
>>>> This parallel Load-features became quite popular on modern CPUs of any
>>>> kind. So whenever a register (there is more than one RAX etc. inside)
>>>> is loaded, then a converted copy is present at the very same time.
>>>> If you look at TTL flipflops, you got two output-pins (Q and notQ).
>>>> So there is no time-difference between non- and -inverting gating.
>>
>>> I agree with all points, but: Loading data into registers is
>>> done early in the execution chain - long before the register
>>> is sent to the execution pipeline. There are at least two or
>>> more clocks between decoding and executing each instruction.
>>
>> This was just an example of what we did in TTL-(pre-MCU)-times.
>> btw: "registers were never sent!" they were just asked for contents. :)
>
>
> Should have been "long before a register's content is sent".
> Registers are physically located somewhere on the die - they
> cannot move to another location, of course. ;)

Actually, even if that seems like an obvious conclusion, it is
effectively 70-80% wrong!

This is because a large percentage of all "register access" in reality
will be fulfilled by the bypass network, so it never makes it back to
the register file between the individual operations.

I.e. effectively dataflow even if it looks like reg-reg operations.

Terje
>
>
>>> What I cannot agree with are *multiple* potential changes of
>>> one specific logic gate within one and the same clock cycle.
>>> This simply contradicts my knowledge and experience with any
>>> digital logic family I ever worked with.
>>
>> Modern chip-landscape allow multiple (parallel connected) gating
>> of all inputs either IMM or [MEM] or even both and more.
>> A single transistor can be located as 'positive logic' or opposite,
>> and dual pairs are already implemented in almost all newer CPUs
>> even not neccessarirly mentioned in the user-manuals.
>
>
> CMOS generally uses complementary MOSFETs: One N- and one P-
> channel MOSFET are tied together and work as a pair.
>
>
>> You can physically connect as many gates as you want or can afford
>> whithin one single clock-transition, and there are alot opportunities.
>>
>> ie: a rising clock edge may trigger several actions at once.
>
>
> In active high logic, okay. In active low logic, the falling
> edge triggers the active half of the clock signal.
>
>
>> And you can believe in that design-engineers today try their very best
>> to achieve top performance even on often merchant-limited costs.
>
>
> For sure. If they don't, their competitors might gain market
> shares...
>
>
> Greetings from Augsburg
>
> Bernhard Schornak


Terje Mathisen

unread,
Aug 23, 2014, 4:46:41 AM8/23/14
to
Rod Pemberton wrote:
>> Not even close. The belt is nothing at all like the x87 stack. Unlike
>> the x87's stack, the belt is always "moving" in one direction, and
>> results naturally fall off the end after a given time. So a bit more
>> like a shift register, but it's not physically implemented as a shift
>> register, and things don't actually move [on] it (except logically).
>> The belt basically replaces registers for both sources and
>> destinations.
>
> To me, that says ring or circular buffer.

And that is the most likely physical implementation, with a current
pointer which indicates the beginning of the belt slots, i.e. where the
next result should drop.

Terje

George Neuner

unread,
Aug 24, 2014, 8:33:03 PM8/24/14
to
The *programming* model is a conveyor belt - ALU/FPU output get put on
at the beginning, values can be used as input from any position, and
at the end the values "fall off" and disappear if not deliberately
preserved by some means.

It solves the hard-for-compilers problem of determining when a
"long-lived" value is dead, so that its register can be reused without
spilling/filling.

Terje will correct me I'm sure, but AIUI the hardware is a distributed
reservation bypass network with associative (re)naming of the
stations. There is no big shift register, or ring or array of
registers.

George

Terje Mathisen

unread,
Aug 25, 2014, 6:31:15 AM8/25/14
to
I'm pretty sure parts of the physical implementation are still in the
"Pending Patent filing" area, so I can't comment, except to say that
the Mill has been stated to cover a huge range in performance, from
tiny models down to a single instruction per cycle, up to the huge
Gold, which is in fact capable of up to 30 instr/cycle.

In this range there is bound to be more than one way to solve this
particular problem. :-)

George Neuner

unread,
Aug 26, 2014, 1:44:23 AM8/26/14
to
On Mon, 25 Aug 2014 12:31:15 +0200, Terje Mathisen
<terje.m...@nospicedham.tmsw.no> wrote:

>I'm pretty sure parts of the physical implementation is still in the
>"Pending Patent filing" area, O I can't comment, except to say that
>since the Mill has been stated to cover a huge range in preformance,
>from tiny models with down to a single instruction per cycle, to the
>huge Gold which is in fact capable of up to 30 instr/cycle.

I'm rather positive that Ivan spoke about this aspect of the design in
the videos ... no specific detail about the actual implementation, but
adamant about what it wasn't.

>In this range there is bound to be more than one way to solve this
>particular problem. :-)

Absolutely. I'm aware of Ivan's concerns re: speculation. Based on
my own IP experience, I believe they are being oversold, but I do
understand them.

>Terje
George

Bernhard Schornak

unread,
Aug 27, 2014, 11:11:19 AM8/27/14
to
Rod Pemberton wrote:


> On Wed, 20 Aug 2014 11:49:31 -0400, Bernhard Schornak <scho...@nospicedham.web.de> wrote:
>
> [...]
>
> Why? You'd still want a way to indicate a change of state and
> steady state conditions. That what the clock does.


And that is what I said. Reading the linked Wiki pages, they support
most of the arguments I cited "out of my head" - especially
my main concern about multiple state changes within one and
the same logic gate in a single clock cycle.


> It seems you keep coming back to this issue, like you're interested
> in having the electronics free-run as fast as possible without a
> clock. That's known as asynchronous. Asynchronous digital circuits
> have no clocking.
>
> http://en.wikipedia.org/wiki/Asynchronous_circuit


Yes. Their advantage is that we do not need a clock signal,
but this is a disadvantage for digital processing, as well:
They need extra hardware to synchronise them and avoid race
conditions (mentioned in the linked text).

I do not think it is possible to build asynchronous units
capable of competing with current multicore processors. Maybe
in the far future, but not in the next decade.


>> *Multiple* potentials in one and the same clock cycle are possible
>> in analog circuits (e.g. the FET booster I developed for my guitars),
>> but never in digital ones.
>
> I'm pretty sure they've developed (two types) of three state logic:
>
> http://en.wikipedia.org/wiki/Three-state_logic
> http://en.wikipedia.org/wiki/Ternary_logic


Yes. The first is older than microprocessors, the latter is
hard to implement, because a third logic level is required
to make it work (e.g. the middle between high and low). If
I remember correctly, attempts were made, but could not win
against (the less complicated) TTL.

Bernhard Schornak

unread,
Aug 27, 2014, 11:11:24 AM8/27/14
to
Terje Mathisen wrote:


> Bernhard Schornak wrote:
>> wolfgang kern wrote:
>>
>>> Bernhard Schornak wrote:
>>>
>>>>> This parallel Load-features became quite popular on modern CPUs of any
>>>>> kind. So whenever a register (there is more than one RAX etc. inside)
>>>>> is loaded, then a converted copy is present at the very same time.
>>>>> If you look at TTL flipflops, you got two output-pins (Q and notQ).
>>>>> So there is no time-difference between non- and -inverting gating.
>>>
>>>> I agree with all points, but: Loading data into registers is
>>>> done early in the execution chain - long before the register
>>>> is sent to the execution pipeline. There are at least two or
>>>> more clocks between decoding and executing each instruction.
>>>
>>> This was just an example of what we did in TTL-(pre-MCU)-times.
>>> btw: "registers were never sent!" they were just asked for contents. :)
>>
>> Should have been "long before a register's content is sent".
>> Registers are physically located somewhere on the die - they
>> cannot move to another location, of course. ;)
>
> Actually, even if that seems like an obvious conclusion, it is effectively 70-80% wrong!
>
> This is because a large percentage of all "register access" in reality will be fulfilled by the
> bypass network, so it never makes it back to the register file between the individual operations.
>
> I.e. effectively dataflow even if it looks like reg-reg operations.


Yes. This is true as long as data are 'pumped' through the
execution pipeline. Results at the end of the pipe have to
be stored somewhere if we don't want to introduce noticeable
inconsistencies, though.

If we did not, and the task scheduler issued a task switch,
the results would be lost and no longer available when the
interrupted thread is continued on another core in another
time slice...

wolfgang kern

unread,
Aug 28, 2014, 5:35:57 AM8/28/14
to

Bernhard Schornak wrote:
...
>> This is because a large percentage of all "register access" in reality
>> will be fulfilled by the bypass network, so it never makes it back to the
>> register file between the individual operations.

>> I.e. effectively dataflow even if it looks like reg-reg operations.

> Yes. This is true as long as data are 'pumped' through the
> execution pipeline. Results at the end of the pipe have to
> be stored somewhere if we don't want to introduce markable
> inconsistencies, though.

Data within the CPU rarely actually move :)
I see the whole register-set including temporary in modern CPUs as
internal RAM-space where tag-gates just tell which location is now
which register.

__
wolfgang


Bernhard Schornak

unread,
Aug 29, 2014, 4:56:17 AM8/29/14
to
IIRC, the current FX design provides 160 integer registers which
are instances of the 16 integer registers 'we' know as assembler
programmers. Four execution pipes deliver some results per cycle,
and two simultaneously processed operations might change one and
the same register more than once in a single cycle.

On the other hand, the 160 registers must be more than just RAM,
because partial register writes introduce delays, see:

http://www.xiaohui.com/dev/mmx/mmx_p_19_1.htm

or

"Software Optimization Guide for AMD Family 15h Processors"
[47414 Rev. 3.08 January 2014], "5.5 Partial-Register Writes"

This would not be the case with simple RAM...


Pfüat'Di

Bernhard

wolfgang kern

unread,
Aug 29, 2014, 4:25:02 PM8/29/14
to

Bernhard Schornak wrote:

...

> IIRC, the current FX design provides 160 integer registers which
> are instances of the 16 integer registers 'we' know as assembler
> progammers. Four execution pipes deliver some results per cycle,
> and two simultaneously processed operations might change one and
> the same register more than once in a single cycle.
>
> On the other hand, the 160 registers must be more than just RAM,
> because partial register writes introduce delays, see:

> http://www.xiaohui.com/dev/mmx/mmx_p_19_1.htm

ok, but:
Don't think in terms of what's possible, just take the obvious
behaviour of modern CPUs as a matter of fact.
ie:
MOV reg to reg ;time 0
Alter a reg by another reg ;time 0
Alter a reg by coded IMM ;time 0
Modify parts of a register ;may stall if this register is read soon.
and more...

> or
> "Software Optimization Guide for AMD Family 15h Processors"
> [47414 Rev. 3.08 January 2014], "5.5 Partial-Register Writes"
> This was not the case with simple RAM...

Yeah, I have this doc.
Partial register stalls may come in if any previous or following
instruction alters or loads the same register (but not more).

Don't let HLL support features fool you;
the CPU will never know which coding style is in progress.

CPU-internals can be learned from hardware-docs and here you may
find a lot of 'micro-ops' behind single cycle instructions.
So your view of the matter may change after reading this.
__
wolfgang


Bernhard Schornak

unread,
Aug 29, 2014, 5:14:48 PM8/29/14
to
wolfgang kern wrote:


> Bernhard Schornak wrote:
>
>> IIRC, the current FX design provides 160 integer registers which
>> are instances of the 16 integer registers 'we' know as assembler
>> progammers. Four execution pipes deliver some results per cycle,
>> and two simultaneously processed operations might change one and
>> the same register more than once in a single cycle.
>>
>> On the other hand, the 160 registers must be more than just RAM,
>> because partial register writes introduce delays, see:
>
>> http://www.xiaohui.com/dev/mmx/mmx_p_19_1.htm
>
> ok, but:
> Dont think in terms of what's possible, just take the obvious
> behaviour from modern CPUs as a matter of fact.
> ie:
> MOV reg to reg ;time 0
> Alter a reg by another reg ;time 0
> Alter a reg by coded IMM ;time 0
> Modify parts of a register ;may stall if this register is read soon.
> and more...


Where are these timings from? Data transfers in zero time are
physically impossible. ;)

(Even renaming a register costs at least one clock...)


>> "Software Optimization Guide for AMD Family 15h Processors"
>> [47414 Rev. 3.08 January 2014], "5.5 Partial-Register Writes"
>> This was not the case with simple RAM...
>
> Yeah, I have this doc.
> partial register stalls may came in if any previous or following
> instruction alter or load the same register (but not more).
>
> dont let you fool by HLL support features,
> the CPU will never know which coding style is in progress.


These are basic questions, there are no programming languages
involved. AMD - and Intel, as well - would not spend about five
and a half pages contemplating delays due to partial register
writes if doing so did not introduce problems.

The reason why I cited chapter 5.5 was my thoughts about the
nature of registers, though. They must be organised differently
from simple RAM, otherwise we could alter single bits without
the fear of huge penalties for doing so.


> CPU-internals can be learned from hardware-docs and here you may
> find a lot of 'micro-ops' behind single cycle instructions.
> So your view of the matter may change after reading this.


Micro-ops - AMD called them 'complex', but has since switched
to 'microcode' - are very time consuming and 'cost' more than
simple operations like "mov reg,imm" or "shr reg".

Even if a lot of tasks can be split into parts which might be
executed simultaneously in parallel units, some steps must be
executed after others, because they depend on results of previous
steps. Hardware designers can reduce the time needed to
calculate intermediate data, but they can't omit any required
step completely.


Have a nice weekend!

Bernhard

wolfgang kern

unread,
Aug 30, 2014, 5:09:53 AM8/30/14
to

Bernhard Schornak wrote:

>>> IIRC, the current FX design provides 160 integer registers which
>>> are instances of the 16 integer registers 'we' know as assembler
>>> progammers. Four execution pipes deliver some results per cycle,
>>> and two simultaneously processed operations might change one and
>>> the same register more than once in a single cycle.

>>> On the other hand, the 160 registers must be more than just RAM,
>>> because partial register writes introduce delays, see:

>>> http://www.xiaohui.com/dev/mmx/mmx_p_19_1.htm

>> ok, but:
>> Dont think in terms of what's possible, just take the obvious
>> behaviour from modern CPUs as a matter of fact.
>> ie:
>> MOV reg to reg ;time 0
>> Alter a reg by another reg ;time 0
>> Alter a reg by coded IMM ;time 0
>> Modify parts of a register ;may stall if this register is read soon.
>> and more...

> Where are these timings from? Data transfers in zero time are
> physically impossible. ;)

Yes, of course these actions take time as well, but because they may
already be decoded (instruction fetch spans several instructions) they
can be executed in parallel on the fly and so don't need extra time.

> (Even renaming a register costs at least one clock...)

clock-edges may just trigger a whole sequence of actions...

>>> "Software Optimization Guide for AMD Family 15h Processors"
>>> [47414 Rev. 3.08 January 2014], "5.5 Partial-Register Writes"
>>> This was not the case with simple RAM...

>> Yeah, I have this doc.
>> partial register stalls may come in if any previous or following
>> instruction alters or loads the same register (but not more).

>> don't let yourself be fooled by HLL support features;
>> the CPU will never know which coding style is in progress.

> These are basic questions; there are no programming languages
> involved. AMD - and iNTEL, as well - would not spend five and a
> half pages on delays due to partial register writes if doing so
> did not introduce problems.

> The reason I cited chapter 5.5, though, was my thoughts about the
> nature of registers. They must be organised differently from
> simple RAM; otherwise we could alter single bits without
> fear of huge penalties for doing so.

we can alter single bits anyway without fear :)
haven't checked the timing of the BT.. instructions recently, but
what I remember is that AMD had no timing issues with them.

>> CPU-internals can be learned from hardware-docs and here you may
>> find a lot of 'micro-ops' behind single cycle instructions.
>> So your view of the matter may change after reading this.

> Micro-ops - AMD called them 'complex', but meanwhile switched
> to 'microcode' - are very time consuming and 'cost' more than
> simple operations like "mov reg,imm" or "shr reg".

I meant the operational steps that can be performed within a single
clock cycle.

> Even if a lot of tasks can be split into parts which might be
> executed simultaneously in parallel units, some steps must be
> executed after others, because they depend on results of
> previous steps. Hardware designers can reduce the time needed
> to calculate intermediate data, but they can't omit any
> required step completely.

> Have a nice weekend!
same in return!
__
wolfgang




Bernhard Schornak

Aug 31, 2014, 12:40:15 PM
Latencies can be 'hidden' behind other, more obvious operations,
but cannot be omitted completely. What is done needs the time to
do it. Only something never done requires no time to not perform
it. ;)


>>>> "Software Optimization Guide for AMD Family 15h Processors"
>>>> [47414 Rev. 3.08 January 2014], "5.5 Partial-Register Writes"
>>>> This was not the case with simple RAM...
>
>>> Yeah, I have this doc.
>>> partial register stalls may come in if any previous or following
>>> instruction alters or loads the same register (but not more).
>
>>> don't let yourself be fooled by HLL support features;
>>> the CPU will never know which coding style is in progress.
>
>> These are basic questions; there are no programming languages
>> involved. AMD - and iNTEL, as well - would not spend five and a
>> half pages on delays due to partial register writes if doing so
>> did not introduce problems.
>
>> The reason I cited chapter 5.5, though, was my thoughts about the
>> nature of registers. They must be organised differently from
>> simple RAM; otherwise we could alter single bits without
>> fear of huge penalties for doing so.
>
> we can alter single bits anyway without fear :)
> haven't checked the timing of the BT.. instructions recently, but
> what I remember is that AMD had no timing issues with them.


Nowadays, data storage is organised in 64 bit units (or larger).
Whether it is RAM, on-die cache or a register: Changing a single
byte inside this basic unit always requires a merge operation.
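
The merge Bernhard describes can be sketched in a few lines. This is
my illustration (not hardware), modelling the 64-bit storage unit as an
integer: writing one byte means reading the whole unit, clearing the
target byte with a mask, and OR-ing the new value in.

```python
def write_byte(unit: int, index: int, value: int) -> int:
    """Merge an 8-bit value into byte `index` of a 64-bit unit.
    Models the read-modify-merge a partial write implies."""
    assert 0 <= index < 8 and 0 <= value < 256
    shift = index * 8
    mask = 0xFF << shift
    # Clear the target byte, then merge the new value in.
    return (unit & ~mask & 0xFFFFFFFFFFFFFFFF) | (value << shift)

unit = 0x1122334455667788
unit = write_byte(unit, 0, 0xAB)   # replace the low byte
# unit is now 0x11223344556677AB
```

The point of the sketch: even a one-byte change touches the whole unit,
which is why partial-register writes cannot be free.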


Pfüat'Di

Bernhard

DSF

Sep 3, 2014, 3:27:39 PM
On Mon, 18 Aug 2014 09:27:02 +0200, Terje Mathisen
<terje.m...@nospicedham.tmsw.no> wrote:

>DSF wrote:
>> The code is a replacement for the generic strstr and therefore
>> should not be optimized for any particular situation. Source and
>> search strings need not contain words, English or otherwise. A better
>> way to look at it is that it searches "source" (a zero-terminated
>> character array) for "search" (another zero-terminated character
>> array).
>
>You can avoid the explicit strlen calls if you combine the search for
>the initial char with the scan for a terminating zero, but that would
>mean that you could not use REPNE SCASB to locate the first matching
>char. I.e. you need to know the max length.
>
>Given that you start with a REPNE SCASB to get the length of the buffer
>to be searched, I would simply stick with the naive code:
>
>REPNE SCASB to locate possible matches, followed by REPE CMPSB to check
>for a full match.

But I never said I use REPNE SCASB to get the length of the buffer.
<BG> Several lifetimes ago, I posted to this group to learn about
processing strings in bigger gulps. You helped me greatly and one of
the final results was dstrlenda_asm. ('d' - to stay out of the
standard's way; strlen and strend combined; 'a' - for 8-bit
characters; '_asm' - assembly callable. There is a dstrlendA() for C.) It
uses GPR or SSE2 if available. It also only uses eax, ecx and edi so
it can be used just about anywhere the REPNE SCASB routine can be
used. Upon return, ecx contains the length of the string in
characters (not bytes) and edi points to the ending zero. The only
difference is you cannot depend on al being zero. Thanks again for
the help!


>If you want to be more fancy, then you should look hard at STRCMPI which
>is more or less optimized to do this type of work:

I've not heard of STRCMPI (all caps). Google produced nothing. My
Borland compiler has a strcmpi (non-portable) and stricmp (portable).
If you did mean stricmp (Shudder!), on my system stricmp is 300
times slower than strcmp. strcmp is written in assembler and uses the
cmpsb code. stricmp is written in C and converts each character of
both strings using toupper() before comparing them. (toupper() calls
_toupper() which performs a test and calls __toupper() which sets a
variable and calls ___toupper()...and then has the current locale to
deal with.)

>
>Look at 16 byte (or 8 short) values at the same time, for up to 16/8
>different target chars.
>>
>> In my current instance, at this time, I am parsing lines of HTML
>> code for particular phrases. Source can have a length of very few
>> characters to thousands and contain many "non-words." (This is why I
>> made the above changes. To reduce the possible thousands of calls to
>> rep cmpsb.)
>
>For your current code I would do things quite differently:
>
>With a big target buffer (html file) and a limited number of key words
>to look for, again and again, it makes a _lot_ of sense to switch to
>Boyer-Moore instead!
>
>Take the fixed hit of setting up a 256-byte skip table for each pattern
>to search for, then blow through the actual searches.
>
>The main headache with HTML parsing isn't finding the key words, but to
>locate the word delimiters: Any kind of whitespace, plus several
>context-sensitive characters can delimit the current token.

Appreciate the advice, but I am actually not literally parsing the
HTML. I am only interested in finding certain fixed strings in the
text which precede the data I'm after. Most of these strings are
short and, from what I've read about Boyer-Moore, it becomes less
efficient with short search strings. This did lead me to contemplate
leaving dstrstr be and designing custom string search code for this
particular application. (A problem with a utility I wrote and use
often has taken me away from this project for a while.) I think a
couple of the search strings may (individually) fit in a single 32-bit
register.
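
One way to read the "fits in a 32-bit register" idea is to load the
4-byte pattern once and compare it against a sliding 4-byte window of
the source with a single integer compare per position. A hedged Python
sketch of the concept (my own, not DSF's code; the struct unpacking
stands in for a dword load):

```python
import struct

def find4(source: bytes, pattern: bytes) -> int:
    """Find a 4-byte pattern by comparing 32-bit windows.
    In assembly this would be one dword load + cmp per position."""
    assert len(pattern) == 4
    (want,) = struct.unpack("<I", pattern)       # pattern as one dword
    for pos in range(len(source) - 3):
        (have,) = struct.unpack_from("<I", source, pos)
        if have == want:
            return pos
    return -1
```

An assembly version would hoist the pattern into a register outside the
loop, so the inner loop is just a load, a compare, and a branch.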

>Terje

DSF
"'Later' is the beginning of what's not to be."
D.S. Fiscus

Terje Mathisen

Sep 3, 2014, 4:42:46 PM
DSF wrote:
> On Mon, 18 Aug 2014 09:27:02 +0200, Terje Mathisen
>> If you want to be more fancy, then you should look hard at STRCMPI which
>> is more or less optimized to do this type of work:
>
> I've not heard of STRCMPI (all caps). Google produced nothing. My

Hmmm....sorry!

PCMPISTRI is the instruction I was looking for; it actually does 256
byte-by-byte comparisons simultaneously!
BTDT. :-)

You should still write at least one BM implementation, it is really
quite educational. :-)

The code can look like this (for byte variables):

again:
REPM UNROLL_FACTOR
movzx ebx,byte ptr [esi]
movzx ebx,byte ptr skiptable[ebx] ;; movzx ebx, byte ptr [edx+ebx]
add esi,ebx
ENDM
test ebx,ebx
jz found_start
cmp esi,ecx ;; ECX has buffer end minus guard distance
jb again

Each time you hit a character which doesn't exist in your search pattern
you get to skip a distance equal to the pattern length. When you get a
possible hit, the skip value is zero, so you can iterate in place,
allowing the unrolling to work without intermediate checks.
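
For readers who want the skip-table idea in higher-level form, here is
a minimal Boyer-Moore-Horspool sketch (my own textbook variant, not
Terje's exact routine - his loop skips on the first window byte and
uses zero-skip entries; Horspool classically skips on the last):

```python
def horspool_find(haystack: bytes, needle: bytes) -> int:
    """Return the index of needle in haystack, or -1.
    A 256-entry skip table, built once per pattern, lets the search
    jump pattern-length bytes past any character that does not occur
    in the pattern."""
    m = len(needle)
    if m == 0:
        return 0
    # Default skip is the full pattern length; characters that occur
    # in the pattern (except its final byte) get smaller skips.
    skip = [m] * 256
    for i, b in enumerate(needle[:-1]):
        skip[b] = m - 1 - i
    pos = 0
    while pos + m <= len(haystack):
        if haystack[pos:pos + m] == needle:
            return pos
        pos += skip[haystack[pos + m - 1]]
    return -1
```

The setup cost is the 256-entry table; the payoff, as noted above, is
that most mismatching positions are skipped m bytes at a time.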

DSF

Sep 3, 2014, 7:43:00 PM
On Wed, 03 Sep 2014 22:27:41 +0200, Terje Mathisen
<terje.m...@nospicedham.tmsw.no> wrote:

>> I've not heard of STRCMPI (all caps). Google produced nothing. My
>
>Hmmm....sorry!
>
>PCMPISTRI is the instruction I was looking for, it actually does 256
>byte-by-byte comparisons simultaneously!
>
>Terje

I have comments on some of what I deleted, but I wanted to get this
out of the way.

I haven't been able to conclusively prove it yet, but is PCMPISTRI
an Intel-only instruction? I notice it's SSE4.2, which is not
available on the AMD Phenom hex core machine I'm typing this message
on. None of the online spots I Googled mentioned anything other than
Intel. And I'm going to guess one needs a relatively new Intel CPU as
well.

Bernhard Schornak

Sep 4, 2014, 7:03:17 AM
DSF wrote:


> I haven't been able to conclusively prove it yet, but is PCMPISTRI
> an Intel-only instruction? I notice it's SSE42, which is not
> available on the AMD Phenom hex core machine I'm typing this message
> on. None of the online spots I Googled mentioned anything other than
> Intel. And I'm going to guess one needs a relatively new Intel CPU as
> well.


It is available on AMD processors family 15h, model 2 (bulldozers
*3**, e.g.: 6300) and 10h...4Fh (Opterons?). However, string
processing is quite time consuming (AMD specs):

PCMPESTRI 30 clocks
PCMPESTRM ? clocks
PCMPISTRI 10 clocks
PCMPISTRM ? clocks

Terje Mathisen

Sep 4, 2014, 7:18:21 AM
Ouch!

I'm almost certain this is a fast (1-2 clocks?) operation on Intel; the
timings above mean that AMD have synthesized it via microcode.

It is really strange that the implicit length (i.e. '\0' terminated C
strings) version is three times faster than the explicit length version,
since the I version has to compare against zero at the same time as the
search pattern bytes.

Bernhard Schornak

Sep 4, 2014, 8:03:26 AM
Terje Mathisen wrote:


> Bernhard Schornak wrote:
>> DSF wrote:
>>
>>
>>> I haven't been able to conclusively prove it yet, but is PCMPISTRI
>>> an Intel-only instruction? I notice it's SSE42, which is not
>>> available on the AMD Phenom hex core machine I'm typing this message
>>> on. None of the online spots I Googled mentioned anything other than
>>> Intel. And I'm going to guess one needs a relatively new Intel CPU as
>>> well.
>>
>>
>> It is available on AMD processors family 15h, model 2 (bulldozers
>> *3**, e.g.: 6300) and 10h...4Fh (Opterons?). However, string
>> processing is quite time consuming (AMD specs):
>>
>> PCMPESTRI 30 clocks
>> PCMPESTRM ? clocks
>> PCMPISTRI 10 clocks
>> PCMPISTRM ? clocks
>
> Ouch!
>
> I'm almost certain this is a fast (1-2 clocks?) operation on Intel; the timings above mean that AMD
> have synthesized it via microcode.


Yes ("?" = "not available").


> It is really strange that the implicit length (i.e. '\0' terminated C strings) version is three
> times faster than the explicit length version, since the I version has to compare against zero at
> the same time as the search pattern bytes.


iNTEL's optimisation guide says

PCMPESTRI 7 clocks
PCMPESTRM 8 clocks
PCMPISTRI 7 clocks
PCMPISTRM 8 clocks

However, iNTEL lists some operations with 0.33 or 0.5 clocks,
so their latencies should be verified on real hardware. ;)

To determine string lengths or perform simple comparisons, it
probably is faster to use PCMPEQ*.

Melzzzzz

Sep 4, 2014, 9:18:33 AM
On Intel the implicit length version is twice as slow, too.

>
> Terje
>



--
press any key to continue or any other to quit...

Melzzzzz

Sep 4, 2014, 9:18:35 AM
Not true. I have measured pcmpestri on Intel and it is twice as slow.

Melzzzzz

Sep 4, 2014, 10:18:40 AM
On Thu, 4 Sep 2014 15:08:45 +0200
Melzzzzz <m...@nospicedham.zzzzz.invalid> wrote:

>
> On Intel implicit length version is twice slower, too.

I wanted to say `explicit`.

Bernhard Schornak

Sep 4, 2014, 10:18:42 AM
As expected. These are very complex tests and comparisons,
so it takes some time to perform all of them. It only pays
off if most of the delivered results are required. To test
single matches, PCMPEQ*/PCMPGT* are the better choice.

DSF

Sep 12, 2014, 3:35:56 PM
On Wed, 30 Jul 2014 03:05:05 -0400, DSF <nota...@address.here>
wrote:

Hello, group!

Normally it bugs me when I'm looking for information on Usenet and a
thread shifts to a different topic. You would think more so for my
own post. But I've enjoyed the hardware discussion that's gone on
here.

Reminds me of my early "programming" days: building projects with
74xx series chips. I would use wire wrap wire looped and soldered to
the ICs, or standard socket, depending on the cost of the chip. (The
price of wire wrap sockets being the main reason I wasn't using the
wire as intended.)

In my early days of programming, I realized I was doing much the
same thing, with the exception of a logic error not leading to hours
of rewiring (at best), or watching $6 (a lot of money for a teenager
back then) of chips go up in smoke.

Thanks,

Phil Carmody

Sep 15, 2014, 2:29:18 PM
Terje Mathisen <terje.m...@nospicedham.tmsw.no> writes:
> Steve wrote:
> > "Rod Pemberton" <dont_us...@nospicedham.xnothavet.cqm> writes:
> >>> Was there a time when add was faster than sub?
> >>
> >> For the 386, 486, and Pentium, that would be: "No." "add" and "sub"
> >> have the same official timings on each respective processor. I don't
> >> have official 286 or 86 (or 186) timings in my notes or more recent
> >> processors. I'd suspect they're the same on each of the earlier
> >> processors too.
> >
> > Hi,
> >
> > All my references (including dead tree versions from Intel and AMD)
> > have ADD and SUB with the same timings for the 8086/88, 80186/188,
> > and 80286 processors.
>
> When you look at the actual HW, ADD and SUB is pretty much always the
> same operation, just a bit toggle on input:
>
> a - b
>
> is the same as
>
> a + (-b)
>
> where (-b) is (b ^ -1) + 1 (i.e. invert all the bits, then add 1)
>
> so we get
>
> a + (b ^ -1) + 1
>
> I.e. an adder can subtract by inverting all the bits in the second
> operand and set the incoming carry flag for the low bit.

Whilst this already is a wonderful insight, I think the cherry on top is
that it doesn't mess with, or get messed up by, the differences in SUB
and SUBC behaviour. If the carry (borrow) bit is set, then the desired
outcome is a + ~b + 1 - 1. So the carry bit, like the b operand, just
gets inverted before use.

> This means that you'll never see a cpu where ADD & SUB have different
> timing, or at least not one with two's complement binary numbers.
> :-)

Unless the negation adds another stage to the pipeline, or is even
done in microcode. Never underestimate the capability for the non-
obvious.
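
Whatever a given pipeline does with it internally, the identity itself
is easy to check. This sketch (mine, not from the thread) verifies
a - b == a + ~b + 1 for 32-bit two's complement values, including
Phil's observation that the incoming borrow is simply inverted before
being fed to the adder's carry-in:

```python
MASK = 0xFFFFFFFF  # 32-bit two's complement

def sub_via_add(a: int, b: int, borrow: int = 0) -> int:
    """Compute (a - b - borrow) with a single adder:
    invert b, invert the incoming borrow, use it as carry-in."""
    return (a + (b ^ MASK) + (borrow ^ 1)) & MASK

for a, b in [(5, 3), (0, 1), (123456789, 987654321)]:
    assert sub_via_add(a, b) == (a - b) & MASK
    assert sub_via_add(a, b, borrow=1) == (a - b - 1) & MASK
```

With borrow clear, the carry-in is 1 and the adder produces a + ~b + 1;
with borrow set, the carry-in is 0 and it produces a + ~b, i.e.
a - b - 1, exactly as SBB-style subtraction requires.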

Phil
--
The best part of re-inventing the wheel is that you get to pick how
many sides the new one has. -- egcagrac0 on SoylentNews