Russell Marks <zgedneil@spam^H^H^H^Hgmail.com> wrote:
>>>>> > On Wednesday, September 9, 2020 at 11:09:35 AM UTC+10, Tony Nicholson wrote:
>>> [...]
>>>>> >>
https://github.com/agn453/UNZIP-CPM-Z80
[...]
>>> I have to say - maybe directing this more at Tony/Martin? - just
>>> looking at the code generally my first thought on optimising for speed
Here we go again. :-) I thought a constant in the degzip_portable.c
that the deflate code is based on looked familiar, and it is - so I
ported over the table-based CRC code from that. Now this is
"expensive" as the table is 1k long and pushes the COM file slightly
past 5k, but extracting my deflate test zip is 32% quicker with this
(combined with my previous changes) as compared with UNZIP152. It
would be possible to construct the table at runtime of course, but on
a Z80 I imagine a precalculated table might be for the best.
(While I'm posting I may as well note that I ran the version *before*
this table-CRC change against every zip on the Walnut Creek CP/M CD
earlier on, with no CRC errors. Obviously the goal there was to check
the non-deflate code is still working ok, and it seems to be.)
Here's the overall patch against UNZIP152.Z80:
--------------- unzip152-rdbybits-and-crc32tab.diff ---------------
--- UNZIP152.Z80 2020-10-13 16:02:05.906861502 +0100
+++ unzip152-new3.z80 2020-10-15 02:27:13.342645445 +0100
@@ -716,34 +755,37 @@
ld (hl),a
ret
;
-updcrc: ld hl,(crc32)
+; based on this from crc32() in degzip_portable.c:
+; for (i = 0; i < len; i++)
+; crc = crc32_tab[(uint8_t)(crc ^ *b++)] ^ (crc >> 8);
+;
+updcrc: ld bc,(crc32)
+ xor c ; A=low byte of crc xor output byte
+ ld h,0
+ ld l,a
+ add hl,hl ; *2
+ add hl,hl ; *4
+ ld de,crc32tab
+ add hl,de
ld de,(crc32 + 2)
+ ; now DEBC is "crc", and HL points to low byte of
+ ; relevant crc32tab entry. Do the xor with "crc"/256,
+ ; starting from the low bytes.
+ ld a,(hl)
+ xor b
ld c,a
- ld b,8
-crclp: ld a,l
- xor c
- srl c
- srl d
- rr e
- rr h
- rr l
- rra
- jr nc,noxor
- ld a,d
- xor 0edh
- ld d,a
- ld a,e
- xor 0b8h
+ inc hl
+ ld a,(hl)
+ xor e
+ ld b,a
+ inc hl
+ ld a,(hl)
+ xor d
ld e,a
- ld a,h
- xor 83h
- ld h,a
- ld a,l
- xor 20h
- ld l,a
-noxor: djnz crclp
- ld (crc32),hl
+ inc hl
+ ld d,(hl) ; high byte is a simple copy
ld (crc32 + 2),de
+ ld (crc32),bc
ret
;
unshrink:
@@ -984,7 +1026,7 @@
lflp: push bc
push hl
ld a,6
- call readbits
+ call rdbybits
pop hl
pop de
ld (hl),a
@@ -999,7 +1041,7 @@
ldfllp: push hl
push bc
ld a,8
- call readbits
+ call rdbybits
pop bc
pop hl
ld (hl),a
@@ -1035,11 +1077,11 @@
or a
jr nz,ur2
ur4: ld a,8
- call readbits
+ call rdbybits
jr ur3
;
ur2: ld a,1
- call readbits
+ call rdbybits
dec l
jr z,ur4
call slenlch
@@ -1073,7 +1115,7 @@
ld a,l
or a
jr z,ur10
- ld (V),a
+ ld (urV),a
ld a,(L_table)
ld h,a
and l
@@ -1106,7 +1148,7 @@
jr nz,ur13
ld a,(D_shift)
ld b,a
- ld a,(V)
+ ld a,(urV)
ur14: srl a
djnz ur14
ld h,a
@@ -1191,7 +1233,7 @@
;
readlengths:
ld a,8
- call readbits
+ call rdbybits
ld d,h
ld e,d
inc hl
@@ -1211,11 +1253,11 @@
push de
push hl
ld a,4
- call readbits
+ call rdbybits
inc a
push af
ld a,4
- call readbits
+ call rdbybits
inc a
ld b,a
pop af
@@ -1412,7 +1454,7 @@
push de
push bc
ld a,1
- call readbits
+ call rdbybits
pop af
push af
or a
@@ -1487,7 +1529,7 @@
jr ui4
;
ui3: ld a,8
- call readbits
+ call rdbybits
ui4: call outb
jr ui1
;
@@ -1512,7 +1554,7 @@
jr nz,ui6
push hl
ld a,8
- call readbits
+ call rdbybits
pop de
add hl,de
ui6: ld de,(mml)
@@ -1529,7 +1571,7 @@
ld (treep),hl
nsloop: push hl
ld a,1
- call readbits
+ call rdbybits
pop hl
or a
jr z,nsleft
@@ -1730,19 +1772,19 @@
;
huffman:
ld a,5
- call readbits
+ call rdbybits
inc a
ld l,a
ld h,1
ld (hlit),hl
ld a,5
- call readbits
+ call rdbybits
inc a
ld (hdist),a
ld a,4
- call readbits
+ call rdbybits
add a,4
ld c,a
@@ -1754,7 +1796,7 @@
push bc
push de
ld a,3
- call readbits
+ call rdbybits
pop hl
ld c,(hl)
ld b,0
@@ -1805,7 +1847,7 @@
cp 010h
jr nz,hmn16
ld a,2
- call readbits
+ call rdbybits
pop hl
pop bc
add a,3
@@ -1823,7 +1865,7 @@
hmn16: cp 011h
jr nz,hmn17
ld a,3
- call readbits
+ call rdbybits
pop hl
pop bc
add a,3
@@ -1839,7 +1881,7 @@
hmn17: cp 012h
jr nz,hmn18
ld a,7
- call readbits
+ call rdbybits
pop hl
pop bc
add a,11
@@ -1965,11 +2007,11 @@
ret nz
ld a,1
- call readbits
+ call rdbybits
push af
ld a,2
- call readbits
+ call rdbybits
or a
jr nz,udnt0
@@ -2125,10 +2167,10 @@
counting:
db 0
init:
- db 0
- db 0
- dw 0,0
- dw -1,-1
+ db 0 ; for bleft
+ db 0 ; for wrtpt
+ dw 0,0 ; for outpos
+ dw -1,-1 ; for crc32
endinit:
inbufp: dw 0080h
readpt: db 80h
@@ -2211,6 +2253,135 @@
db 06dh, 0dbh, 0b6h, 06dh, 0dbh, 0b6h, 0cdh, 0dbh
db 0b6h, 06dh, 0dbh, 0b6h, 06dh, 0dbh, 0a8h, 06dh
db 0ceh, 08bh, 06dh, 03bh
+crc32tab: ; crc32_tab[] from degzip_portable.c, takes 1k
+ db 000h,000h,000h,000h,096h,030h,007h,077h
+ db 02Ch,061h,00Eh,0EEh,0BAh,051h,009h,099h
+ db 019h,0C4h,06Dh,007h,08Fh,0F4h,06Ah,070h
+ db 035h,0A5h,063h,0E9h,0A3h,095h,064h,09Eh
+ db 032h,088h,0DBh,00Eh,0A4h,0B8h,0DCh,079h
+ db 01Eh,0E9h,0D5h,0E0h,088h,0D9h,0D2h,097h
+ db 02Bh,04Ch,0B6h,009h,0BDh,07Ch,0B1h,07Eh
+ db 007h,02Dh,0B8h,0E7h,091h,01Dh,0BFh,090h
+ db 064h,010h,0B7h,01Dh,0F2h,020h,0B0h,06Ah
+ db 048h,071h,0B9h,0F3h,0DEh,041h,0BEh,084h
+ db 07Dh,0D4h,0DAh,01Ah,0EBh,0E4h,0DDh,06Dh
+ db 051h,0B5h,0D4h,0F4h,0C7h,085h,0D3h,083h
+ db 056h,098h,06Ch,013h,0C0h,0A8h,06Bh,064h
+ db 07Ah,0F9h,062h,0FDh,0ECh,0C9h,065h,08Ah
+ db 04Fh,05Ch,001h,014h,0D9h,06Ch,006h,063h
+ db 063h,03Dh,00Fh,0FAh,0F5h,00Dh,008h,08Dh
+ db 0C8h,020h,06Eh,03Bh,05Eh,010h,069h,04Ch
+ db 0E4h,041h,060h,0D5h,072h,071h,067h,0A2h
+ db 0D1h,0E4h,003h,03Ch,047h,0D4h,004h,04Bh
+ db 0FDh,085h,00Dh,0D2h,06Bh,0B5h,00Ah,0A5h
+ db 0FAh,0A8h,0B5h,035h,06Ch,098h,0B2h,042h
+ db 0D6h,0C9h,0BBh,0DBh,040h,0F9h,0BCh,0ACh
+ db 0E3h,06Ch,0D8h,032h,075h,05Ch,0DFh,045h
+ db 0CFh,00Dh,0D6h,0DCh,059h,03Dh,0D1h,0ABh
+ db 0ACh,030h,0D9h,026h,03Ah,000h,0DEh,051h
+ db 080h,051h,0D7h,0C8h,016h,061h,0D0h,0BFh
+ db 0B5h,0F4h,0B4h,021h,023h,0C4h,0B3h,056h
+ db 099h,095h,0BAh,0CFh,00Fh,0A5h,0BDh,0B8h
+ db 09Eh,0B8h,002h,028h,008h,088h,005h,05Fh
+ db 0B2h,0D9h,00Ch,0C6h,024h,0E9h,00Bh,0B1h
+ db 087h,07Ch,06Fh,02Fh,011h,04Ch,068h,058h
+ db 0ABh,01Dh,061h,0C1h,03Dh,02Dh,066h,0B6h
+ db 090h,041h,0DCh,076h,006h,071h,0DBh,001h
+ db 0BCh,020h,0D2h,098h,02Ah,010h,0D5h,0EFh
+ db 089h,085h,0B1h,071h,01Fh,0B5h,0B6h,006h
+ db 0A5h,0E4h,0BFh,09Fh,033h,0D4h,0B8h,0E8h
+ db 0A2h,0C9h,007h,078h,034h,0F9h,000h,00Fh
+ db 08Eh,0A8h,009h,096h,018h,098h,00Eh,0E1h
+ db 0BBh,00Dh,06Ah,07Fh,02Dh,03Dh,06Dh,008h
+ db 097h,06Ch,064h,091h,001h,05Ch,063h,0E6h
+ db 0F4h,051h,06Bh,06Bh,062h,061h,06Ch,01Ch
+ db 0D8h,030h,065h,085h,04Eh,000h,062h,0F2h
+ db 0EDh,095h,006h,06Ch,07Bh,0A5h,001h,01Bh
+ db 0C1h,0F4h,008h,082h,057h,0C4h,00Fh,0F5h
+ db 0C6h,0D9h,0B0h,065h,050h,0E9h,0B7h,012h
+ db 0EAh,0B8h,0BEh,08Bh,07Ch,088h,0B9h,0FCh
+ db 0DFh,01Dh,0DDh,062h,049h,02Dh,0DAh,015h
+ db 0F3h,07Ch,0D3h,08Ch,065h,04Ch,0D4h,0FBh
+ db 058h,061h,0B2h,04Dh,0CEh,051h,0B5h,03Ah
+ db 074h,000h,0BCh,0A3h,0E2h,030h,0BBh,0D4h
+ db 041h,0A5h,0DFh,04Ah,0D7h,095h,0D8h,03Dh
+ db 06Dh,0C4h,0D1h,0A4h,0FBh,0F4h,0D6h,0D3h
+ db 06Ah,0E9h,069h,043h,0FCh,0D9h,06Eh,034h
+ db 046h,088h,067h,0ADh,0D0h,0B8h,060h,0DAh
+ db 073h,02Dh,004h,044h,0E5h,01Dh,003h,033h
+ db 05Fh,04Ch,00Ah,0AAh,0C9h,07Ch,00Dh,0DDh
+ db 03Ch,071h,005h,050h,0AAh,041h,002h,027h
+ db 010h,010h,00Bh,0BEh,086h,020h,00Ch,0C9h
+ db 025h,0B5h,068h,057h,0B3h,085h,06Fh,020h
+ db 009h,0D4h,066h,0B9h,09Fh,0E4h,061h,0CEh
+ db 00Eh,0F9h,0DEh,05Eh,098h,0C9h,0D9h,029h
+ db 022h,098h,0D0h,0B0h,0B4h,0A8h,0D7h,0C7h
+ db 017h,03Dh,0B3h,059h,081h,00Dh,0B4h,02Eh
+ db 03Bh,05Ch,0BDh,0B7h,0ADh,06Ch,0BAh,0C0h
+ db 020h,083h,0B8h,0EDh,0B6h,0B3h,0BFh,09Ah
+ db 00Ch,0E2h,0B6h,003h,09Ah,0D2h,0B1h,074h
+ db 039h,047h,0D5h,0EAh,0AFh,077h,0D2h,09Dh
+ db 015h,026h,0DBh,004h,083h,016h,0DCh,073h
+ db 012h,00Bh,063h,0E3h,084h,03Bh,064h,094h
+ db 03Eh,06Ah,06Dh,00Dh,0A8h,05Ah,06Ah,07Ah
+ db 00Bh,0CFh,00Eh,0E4h,09Dh,0FFh,009h,093h
+ db 027h,0AEh,000h,00Ah,0B1h,09Eh,007h,07Dh
+ db 044h,093h,00Fh,0F0h,0D2h,0A3h,008h,087h
+ db 068h,0F2h,001h,01Eh,0FEh,0C2h,006h,069h
+ db 05Dh,057h,062h,0F7h,0CBh,067h,065h,080h
+ db 071h,036h,06Ch,019h,0E7h,006h,06Bh,06Eh
+ db 076h,01Bh,0D4h,0FEh,0E0h,02Bh,0D3h,089h
+ db 05Ah,07Ah,0DAh,010h,0CCh,04Ah,0DDh,067h
+ db 06Fh,0DFh,0B9h,0F9h,0F9h,0EFh,0BEh,08Eh
+ db 043h,0BEh,0B7h,017h,0D5h,08Eh,0B0h,060h
+ db 0E8h,0A3h,0D6h,0D6h,07Eh,093h,0D1h,0A1h
+ db 0C4h,0C2h,0D8h,038h,052h,0F2h,0DFh,04Fh
+ db 0F1h,067h,0BBh,0D1h,067h,057h,0BCh,0A6h
+ db 0DDh,006h,0B5h,03Fh,04Bh,036h,0B2h,048h
+ db 0DAh,02Bh,00Dh,0D8h,04Ch,01Bh,00Ah,0AFh
+ db 0F6h,04Ah,003h,036h,060h,07Ah,004h,041h
+ db 0C3h,0EFh,060h,0DFh,055h,0DFh,067h,0A8h
+ db 0EFh,08Eh,06Eh,031h,079h,0BEh,069h,046h
+ db 08Ch,0B3h,061h,0CBh,01Ah,083h,066h,0BCh
+ db 0A0h,0D2h,06Fh,025h,036h,0E2h,068h,052h
+ db 095h,077h,00Ch,0CCh,003h,047h,00Bh,0BBh
+ db 0B9h,016h,002h,022h,02Fh,026h,005h,055h
+ db 0BEh,03Bh,0BAh,0C5h,028h,00Bh,0BDh,0B2h
+ db 092h,05Ah,0B4h,02Bh,004h,06Ah,0B3h,05Ch
+ db 0A7h,0FFh,0D7h,0C2h,031h,0CFh,0D0h,0B5h
+ db 08Bh,09Eh,0D9h,02Ch,01Dh,0AEh,0DEh,05Bh
+ db 0B0h,0C2h,064h,09Bh,026h,0F2h,063h,0ECh
+ db 09Ch,0A3h,06Ah,075h,00Ah,093h,06Dh,002h
+ db 0A9h,006h,009h,09Ch,03Fh,036h,00Eh,0EBh
+ db 085h,067h,007h,072h,013h,057h,000h,005h
+ db 082h,04Ah,0BFh,095h,014h,07Ah,0B8h,0E2h
+ db 0AEh,02Bh,0B1h,07Bh,038h,01Bh,0B6h,00Ch
+ db 09Bh,08Eh,0D2h,092h,00Dh,0BEh,0D5h,0E5h
+ db 0B7h,0EFh,0DCh,07Ch,021h,0DFh,0DBh,00Bh
+ db 0D4h,0D2h,0D3h,086h,042h,0E2h,0D4h,0F1h
+ db 0F8h,0B3h,0DDh,068h,06Eh,083h,0DAh,01Fh
+ db 0CDh,016h,0BEh,081h,05Bh,026h,0B9h,0F6h
+ db 0E1h,077h,0B0h,06Fh,077h,047h,0B7h,018h
+ db 0E6h,05Ah,008h,088h,070h,06Ah,00Fh,0FFh
+ db 0CAh,03Bh,006h,066h,05Ch,00Bh,001h,011h
+ db 0FFh,09Eh,065h,08Fh,069h,0AEh,062h,0F8h
+ db 0D3h,0FFh,06Bh,061h,045h,0CFh,06Ch,016h
+ db 078h,0E2h,00Ah,0A0h,0EEh,0D2h,00Dh,0D7h
+ db 054h,083h,004h,04Eh,0C2h,0B3h,003h,039h
+ db 061h,026h,067h,0A7h,0F7h,016h,060h,0D0h
+ db 04Dh,047h,069h,049h,0DBh,077h,06Eh,03Eh
+ db 04Ah,06Ah,0D1h,0AEh,0DCh,05Ah,0D6h,0D9h
+ db 066h,00Bh,0DFh,040h,0F0h,03Bh,0D8h,037h
+ db 053h,0AEh,0BCh,0A9h,0C5h,09Eh,0BBh,0DEh
+ db 07Fh,0CFh,0B2h,047h,0E9h,0FFh,0B5h,030h
+ db 01Ch,0F2h,0BDh,0BDh,08Ah,0C2h,0BAh,0CAh
+ db 030h,093h,0B3h,053h,0A6h,0A3h,0B4h,024h
+ db 005h,036h,0D0h,0BAh,093h,006h,0D7h,0CDh
+ db 029h,057h,0DEh,054h,0BFh,067h,0D9h,023h
+ db 02Eh,07Ah,066h,0B3h,0B8h,04Ah,061h,0C4h
+ db 002h,01Bh,068h,05Dh,094h,02Bh,06Fh,02Ah
+ db 037h,0BEh,00Bh,0B4h,0A1h,08Eh,00Ch,0C3h
+ db 01Bh,0DFh,005h,05Ah,08Dh,0EFh,002h,02Dh
;
; uninitialized storage
;
@@ -2237,6 +2408,7 @@
ds 24
mtchfcb:
ds 11
+; note that as indicated above, bitbuf must be the byte before bleft
bitbuf: ds 1
vars:
bleft: ds 1
@@ -2250,7 +2422,7 @@
ds 1
D_shift:
ds 1
-V: ds 1
+urV: ds 1
nchar: ds 1
lchar: ds 1
ExState:
@@ -2311,5 +2483,5 @@
disttr: ds 4 * nrdist
endtr:
ds 8192 + 2 - (endtr - lenld)
-
+endaddr: ; must be no vars/data beyond this point
end
--------------- unzip152-rdbybits-and-crc32tab.diff ---------------
And I may as well include the C code to generate the table, again
based on degzip_portable.c:
-------------------- gentable.c --------------------
#include <stdio.h>
/* Reflected CRC-32 polynomial constant XORed in by the table loop. */
#define CRC32_POLY 0xedb88320UL

/*
 * Compute one table entry: start from the index and perform eight
 * shift-right steps, XORing in CRC32_POLY whenever the bit shifted
 * out of the low end is set.
 */
static unsigned long crc32_entry(unsigned long index)
{
    unsigned long c = index;
    int bit;

    for (bit = 0; bit < 8; bit++)
        c = (c >> 1) ^ ((c & 1) ? CRC32_POLY : 0UL);
    return c;
}

/*
 * Print the 256 table entries to stdout as assembler "db" lines.
 * Two entries go on each line; every entry is written as four
 * "0XXh" hex bytes, low byte first.
 */
int main(void)
{
    unsigned long c, i;

    for (i = 0; i < 256; i++) {
        /* even-numbered entries start a new "db" line */
        if ((i & 1) == 0)
            printf("\tdb\t");

        c = crc32_entry(i);

        /*
         * The arguments are unsigned long, so the conversions must be
         * %lX; a plain %X expects unsigned int and the mismatch is
         * undefined behaviour.
         */
        printf("%03lXh,%03lXh,%03lXh,%03lXh",
               c & 255, (c >> 8) & 255, (c >> 16) & 255, (c >> 24) & 255);

        /* odd-numbered entries end the line, even ones are followed by a comma */
        putchar(((i & 1) == 1) ? '\n' : ',');
    }
    return 0;
}
-------------------- gentable.c --------------------
-Rus.