Russell Marks <zgedneil@spam^H^H^H^Hgmail.com> wrote:
>>>>> > On Wednesday, September 9, 2020 at 11:09:35 AM UTC+10, Tony Nicholson wrote: 
>>> [...]
>>>>> >> 
https://github.com/agn453/UNZIP-CPM-Z80 
[...]
>>> I have to say - maybe directing this more at Tony/Martin? - just
>>> looking at the code generally my first thought on optimising for speed
Here we go again. :-) I thought a constant in degzip_portable.c (which
the deflate code is based on) looked familiar, and it is - so I
ported over the table-based CRC code from that. Now this is
"expensive", as the table is 1k long and pushes the COM file slightly
past 5k, but extracting my deflate test zip is 32% quicker with this
(combined with my previous changes) than with UNZIP152. It
would be possible to construct the table at runtime of course, but on
a Z80 I imagine a precalculated table might be for the best.
(While I'm posting I may as well note that I ran the version *before*
this table-CRC change against every zip on the Walnut Creek CP/M CD
earlier on, with no CRC errors. Obviously the goal there was to check
the non-deflate code is still working ok, and it seems to be.)
Here's the overall patch against UNZIP152.Z80:
--------------- unzip152-rdbybits-and-crc32tab.diff ---------------
--- UNZIP152.Z80	2020-10-13 16:02:05.906861502 +0100
+++ unzip152-new3.z80	2020-10-15 02:27:13.342645445 +0100
@@ -716,34 +755,37 @@
 	ld	(hl),a
 	ret
 ;
-updcrc:	ld	hl,(crc32)
+; based on this from crc32() in degzip_portable.c:
+;    for (i = 0; i < len; i++)
+;        crc = crc32_tab[(uint8_t)(crc ^ *b++)] ^ (crc >> 8);
+;
+updcrc:	ld	bc,(crc32)
+	xor	c		; A=low byte of crc xor output byte
+	ld	h,0
+	ld	l,a
+	add	hl,hl		; *2
+	add	hl,hl		; *4
+	ld	de,crc32tab
+	add	hl,de
 	ld	de,(crc32 + 2)
+	; now DEBC is "crc", and HL points to low byte of
+	; relevant crc32tab entry. Do the xor with "crc"/256,
+	; starting from the low bytes.
+	ld	a,(hl)
+	xor	b
 	ld	c,a
-	ld	b,8
-crclp:	ld	a,l
-	xor	c
-	srl	c
-	srl	d
-	rr	e
-	rr	h
-	rr	l
-	rra
-	jr	nc,noxor
-	ld	a,d
-	xor	0edh
-	ld	d,a
-	ld	a,e
-	xor	0b8h
+	inc	hl
+	ld	a,(hl)
+	xor	e
+	ld	b,a
+	inc	hl
+	ld	a,(hl)
+	xor	d
 	ld	e,a
-	ld	a,h
-	xor	83h
-	ld	h,a
-	ld	a,l
-	xor	20h
-	ld	l,a
-noxor:	djnz	crclp
-	ld	(crc32),hl
+	inc	hl
+	ld	d,(hl)		; high byte is a simple copy
 	ld	(crc32 + 2),de
+	ld	(crc32),bc
 	ret
 ;
 unshrink:
@@ -984,7 +1026,7 @@
 lflp:	push	bc
 	push	hl
 	ld	a,6
-	call	readbits
+	call	rdbybits
 	pop	hl
 	pop	de
 	ld	(hl),a
@@ -999,7 +1041,7 @@
 ldfllp:	push	hl
 	push	bc
 	ld	a,8
-	call	readbits
+	call	rdbybits
 	pop	bc
 	pop	hl
 	ld	(hl),a
@@ -1035,11 +1077,11 @@
 	or	a
 	jr	nz,ur2
 ur4:	ld	a,8
-	call	readbits
+	call	rdbybits
 	jr	ur3
 ;
 ur2:	ld	a,1
-	call	readbits
+	call	rdbybits
 	dec	l
 	jr	z,ur4
 	call	slenlch
@@ -1073,7 +1115,7 @@
 	ld	a,l
 	or	a
 	jr	z,ur10
-	ld	(V),a
+	ld	(urV),a
 	ld	a,(L_table)
 	ld	h,a
 	and	l
@@ -1106,7 +1148,7 @@
 	jr	nz,ur13
 	ld	a,(D_shift)
 	ld	b,a
-	ld	a,(V)
+	ld	a,(urV)
 ur14:	srl	a
 	djnz	ur14
 	ld	h,a
@@ -1191,7 +1233,7 @@
 ;
 readlengths:
 	ld	a,8
-	call	readbits
+	call	rdbybits
 	ld	d,h
 	ld	e,d
 	inc	hl
@@ -1211,11 +1253,11 @@
 	push	de
 	push	hl
 	ld	a,4
-	call	readbits
+	call	rdbybits
 	inc	a
 	push	af
 	ld	a,4
-	call	readbits
+	call	rdbybits
 	inc	a
 	ld	b,a
 	pop	af
@@ -1412,7 +1454,7 @@
 	push	de
 	push	bc
 	ld	a,1
-	call	readbits
+	call	rdbybits
 	pop	af
 	push	af
 	or	a
@@ -1487,7 +1529,7 @@
 	jr	ui4
 ;
 ui3:	ld	a,8
-	call	readbits
+	call	rdbybits
 ui4:	call	outb
 	jr	ui1
 ;
@@ -1512,7 +1554,7 @@
 	jr	nz,ui6
 	push	hl
 	ld	a,8
-	call	readbits
+	call	rdbybits
 	pop	de
 	add	hl,de
 ui6:	ld	de,(mml)
@@ -1529,7 +1571,7 @@
 	ld	(treep),hl
 nsloop:	push	hl
 	ld	a,1
-	call	readbits
+	call	rdbybits
 	pop	hl
 	or	a
 	jr	z,nsleft
@@ -1730,19 +1772,19 @@
 ;
 huffman:
 	ld	a,5
-	call	readbits
+	call	rdbybits
 	inc	a
 	ld	l,a
 	ld	h,1
 	ld	(hlit),hl
 
 	ld	a,5
-	call	readbits
+	call	rdbybits
 	inc	a
 	ld	(hdist),a
 
 	ld	a,4
-	call	readbits
+	call	rdbybits
 	add	a,4
 	ld	c,a
 
@@ -1754,7 +1796,7 @@
 	push	bc
 	push	de
 	ld	a,3
-	call	readbits
+	call	rdbybits
 	pop	hl
 	ld	c,(hl)
 	ld	b,0
@@ -1805,7 +1847,7 @@
 	cp	010h
 	jr	nz,hmn16
 	ld	a,2
-	call	readbits
+	call	rdbybits
 	pop	hl
 	pop	bc
 	add	a,3
@@ -1823,7 +1865,7 @@
 hmn16:	cp	011h
 	jr	nz,hmn17
 	ld	a,3
-	call	readbits
+	call	rdbybits
 	pop	hl
 	pop	bc
 	add	a,3
@@ -1839,7 +1881,7 @@
 hmn17:	cp	012h
 	jr	nz,hmn18
 	ld	a,7
-	call	readbits
+	call	rdbybits
 	pop	hl
 	pop	bc
 	add	a,11
@@ -1965,11 +2007,11 @@
 	ret	nz
 
 	ld	a,1
-	call	readbits
+	call	rdbybits
 	push	af
 
 	ld	a,2
-	call	readbits
+	call	rdbybits
 	or	a
 	jr	nz,udnt0
 
@@ -2125,10 +2167,10 @@
 counting:
 	db	0
 init:
-	db	0
-	db	0
-	dw	0,0
-	dw	-1,-1
+	db	0	; for bleft
+	db	0	; for wrtpt
+	dw	0,0	; for outpos
+	dw	-1,-1	; for crc32
 endinit:
 inbufp:	dw	0080h
 readpt:	db	80h
@@ -2211,6 +2253,135 @@
 	db	06dh, 0dbh, 0b6h, 06dh, 0dbh, 0b6h, 0cdh, 0dbh
 	db	0b6h, 06dh, 0dbh, 0b6h, 06dh, 0dbh, 0a8h, 06dh
 	db	0ceh, 08bh, 06dh, 03bh
+crc32tab:	; crc32_tab[] from degzip_portable.c, takes 1k
+	db	000h,000h,000h,000h,096h,030h,007h,077h
+	db	02Ch,061h,00Eh,0EEh,0BAh,051h,009h,099h
+	db	019h,0C4h,06Dh,007h,08Fh,0F4h,06Ah,070h
+	db	035h,0A5h,063h,0E9h,0A3h,095h,064h,09Eh
+	db	032h,088h,0DBh,00Eh,0A4h,0B8h,0DCh,079h
+	db	01Eh,0E9h,0D5h,0E0h,088h,0D9h,0D2h,097h
+	db	02Bh,04Ch,0B6h,009h,0BDh,07Ch,0B1h,07Eh
+	db	007h,02Dh,0B8h,0E7h,091h,01Dh,0BFh,090h
+	db	064h,010h,0B7h,01Dh,0F2h,020h,0B0h,06Ah
+	db	048h,071h,0B9h,0F3h,0DEh,041h,0BEh,084h
+	db	07Dh,0D4h,0DAh,01Ah,0EBh,0E4h,0DDh,06Dh
+	db	051h,0B5h,0D4h,0F4h,0C7h,085h,0D3h,083h
+	db	056h,098h,06Ch,013h,0C0h,0A8h,06Bh,064h
+	db	07Ah,0F9h,062h,0FDh,0ECh,0C9h,065h,08Ah
+	db	04Fh,05Ch,001h,014h,0D9h,06Ch,006h,063h
+	db	063h,03Dh,00Fh,0FAh,0F5h,00Dh,008h,08Dh
+	db	0C8h,020h,06Eh,03Bh,05Eh,010h,069h,04Ch
+	db	0E4h,041h,060h,0D5h,072h,071h,067h,0A2h
+	db	0D1h,0E4h,003h,03Ch,047h,0D4h,004h,04Bh
+	db	0FDh,085h,00Dh,0D2h,06Bh,0B5h,00Ah,0A5h
+	db	0FAh,0A8h,0B5h,035h,06Ch,098h,0B2h,042h
+	db	0D6h,0C9h,0BBh,0DBh,040h,0F9h,0BCh,0ACh
+	db	0E3h,06Ch,0D8h,032h,075h,05Ch,0DFh,045h
+	db	0CFh,00Dh,0D6h,0DCh,059h,03Dh,0D1h,0ABh
+	db	0ACh,030h,0D9h,026h,03Ah,000h,0DEh,051h
+	db	080h,051h,0D7h,0C8h,016h,061h,0D0h,0BFh
+	db	0B5h,0F4h,0B4h,021h,023h,0C4h,0B3h,056h
+	db	099h,095h,0BAh,0CFh,00Fh,0A5h,0BDh,0B8h
+	db	09Eh,0B8h,002h,028h,008h,088h,005h,05Fh
+	db	0B2h,0D9h,00Ch,0C6h,024h,0E9h,00Bh,0B1h
+	db	087h,07Ch,06Fh,02Fh,011h,04Ch,068h,058h
+	db	0ABh,01Dh,061h,0C1h,03Dh,02Dh,066h,0B6h
+	db	090h,041h,0DCh,076h,006h,071h,0DBh,001h
+	db	0BCh,020h,0D2h,098h,02Ah,010h,0D5h,0EFh
+	db	089h,085h,0B1h,071h,01Fh,0B5h,0B6h,006h
+	db	0A5h,0E4h,0BFh,09Fh,033h,0D4h,0B8h,0E8h
+	db	0A2h,0C9h,007h,078h,034h,0F9h,000h,00Fh
+	db	08Eh,0A8h,009h,096h,018h,098h,00Eh,0E1h
+	db	0BBh,00Dh,06Ah,07Fh,02Dh,03Dh,06Dh,008h
+	db	097h,06Ch,064h,091h,001h,05Ch,063h,0E6h
+	db	0F4h,051h,06Bh,06Bh,062h,061h,06Ch,01Ch
+	db	0D8h,030h,065h,085h,04Eh,000h,062h,0F2h
+	db	0EDh,095h,006h,06Ch,07Bh,0A5h,001h,01Bh
+	db	0C1h,0F4h,008h,082h,057h,0C4h,00Fh,0F5h
+	db	0C6h,0D9h,0B0h,065h,050h,0E9h,0B7h,012h
+	db	0EAh,0B8h,0BEh,08Bh,07Ch,088h,0B9h,0FCh
+	db	0DFh,01Dh,0DDh,062h,049h,02Dh,0DAh,015h
+	db	0F3h,07Ch,0D3h,08Ch,065h,04Ch,0D4h,0FBh
+	db	058h,061h,0B2h,04Dh,0CEh,051h,0B5h,03Ah
+	db	074h,000h,0BCh,0A3h,0E2h,030h,0BBh,0D4h
+	db	041h,0A5h,0DFh,04Ah,0D7h,095h,0D8h,03Dh
+	db	06Dh,0C4h,0D1h,0A4h,0FBh,0F4h,0D6h,0D3h
+	db	06Ah,0E9h,069h,043h,0FCh,0D9h,06Eh,034h
+	db	046h,088h,067h,0ADh,0D0h,0B8h,060h,0DAh
+	db	073h,02Dh,004h,044h,0E5h,01Dh,003h,033h
+	db	05Fh,04Ch,00Ah,0AAh,0C9h,07Ch,00Dh,0DDh
+	db	03Ch,071h,005h,050h,0AAh,041h,002h,027h
+	db	010h,010h,00Bh,0BEh,086h,020h,00Ch,0C9h
+	db	025h,0B5h,068h,057h,0B3h,085h,06Fh,020h
+	db	009h,0D4h,066h,0B9h,09Fh,0E4h,061h,0CEh
+	db	00Eh,0F9h,0DEh,05Eh,098h,0C9h,0D9h,029h
+	db	022h,098h,0D0h,0B0h,0B4h,0A8h,0D7h,0C7h
+	db	017h,03Dh,0B3h,059h,081h,00Dh,0B4h,02Eh
+	db	03Bh,05Ch,0BDh,0B7h,0ADh,06Ch,0BAh,0C0h
+	db	020h,083h,0B8h,0EDh,0B6h,0B3h,0BFh,09Ah
+	db	00Ch,0E2h,0B6h,003h,09Ah,0D2h,0B1h,074h
+	db	039h,047h,0D5h,0EAh,0AFh,077h,0D2h,09Dh
+	db	015h,026h,0DBh,004h,083h,016h,0DCh,073h
+	db	012h,00Bh,063h,0E3h,084h,03Bh,064h,094h
+	db	03Eh,06Ah,06Dh,00Dh,0A8h,05Ah,06Ah,07Ah
+	db	00Bh,0CFh,00Eh,0E4h,09Dh,0FFh,009h,093h
+	db	027h,0AEh,000h,00Ah,0B1h,09Eh,007h,07Dh
+	db	044h,093h,00Fh,0F0h,0D2h,0A3h,008h,087h
+	db	068h,0F2h,001h,01Eh,0FEh,0C2h,006h,069h
+	db	05Dh,057h,062h,0F7h,0CBh,067h,065h,080h
+	db	071h,036h,06Ch,019h,0E7h,006h,06Bh,06Eh
+	db	076h,01Bh,0D4h,0FEh,0E0h,02Bh,0D3h,089h
+	db	05Ah,07Ah,0DAh,010h,0CCh,04Ah,0DDh,067h
+	db	06Fh,0DFh,0B9h,0F9h,0F9h,0EFh,0BEh,08Eh
+	db	043h,0BEh,0B7h,017h,0D5h,08Eh,0B0h,060h
+	db	0E8h,0A3h,0D6h,0D6h,07Eh,093h,0D1h,0A1h
+	db	0C4h,0C2h,0D8h,038h,052h,0F2h,0DFh,04Fh
+	db	0F1h,067h,0BBh,0D1h,067h,057h,0BCh,0A6h
+	db	0DDh,006h,0B5h,03Fh,04Bh,036h,0B2h,048h
+	db	0DAh,02Bh,00Dh,0D8h,04Ch,01Bh,00Ah,0AFh
+	db	0F6h,04Ah,003h,036h,060h,07Ah,004h,041h
+	db	0C3h,0EFh,060h,0DFh,055h,0DFh,067h,0A8h
+	db	0EFh,08Eh,06Eh,031h,079h,0BEh,069h,046h
+	db	08Ch,0B3h,061h,0CBh,01Ah,083h,066h,0BCh
+	db	0A0h,0D2h,06Fh,025h,036h,0E2h,068h,052h
+	db	095h,077h,00Ch,0CCh,003h,047h,00Bh,0BBh
+	db	0B9h,016h,002h,022h,02Fh,026h,005h,055h
+	db	0BEh,03Bh,0BAh,0C5h,028h,00Bh,0BDh,0B2h
+	db	092h,05Ah,0B4h,02Bh,004h,06Ah,0B3h,05Ch
+	db	0A7h,0FFh,0D7h,0C2h,031h,0CFh,0D0h,0B5h
+	db	08Bh,09Eh,0D9h,02Ch,01Dh,0AEh,0DEh,05Bh
+	db	0B0h,0C2h,064h,09Bh,026h,0F2h,063h,0ECh
+	db	09Ch,0A3h,06Ah,075h,00Ah,093h,06Dh,002h
+	db	0A9h,006h,009h,09Ch,03Fh,036h,00Eh,0EBh
+	db	085h,067h,007h,072h,013h,057h,000h,005h
+	db	082h,04Ah,0BFh,095h,014h,07Ah,0B8h,0E2h
+	db	0AEh,02Bh,0B1h,07Bh,038h,01Bh,0B6h,00Ch
+	db	09Bh,08Eh,0D2h,092h,00Dh,0BEh,0D5h,0E5h
+	db	0B7h,0EFh,0DCh,07Ch,021h,0DFh,0DBh,00Bh
+	db	0D4h,0D2h,0D3h,086h,042h,0E2h,0D4h,0F1h
+	db	0F8h,0B3h,0DDh,068h,06Eh,083h,0DAh,01Fh
+	db	0CDh,016h,0BEh,081h,05Bh,026h,0B9h,0F6h
+	db	0E1h,077h,0B0h,06Fh,077h,047h,0B7h,018h
+	db	0E6h,05Ah,008h,088h,070h,06Ah,00Fh,0FFh
+	db	0CAh,03Bh,006h,066h,05Ch,00Bh,001h,011h
+	db	0FFh,09Eh,065h,08Fh,069h,0AEh,062h,0F8h
+	db	0D3h,0FFh,06Bh,061h,045h,0CFh,06Ch,016h
+	db	078h,0E2h,00Ah,0A0h,0EEh,0D2h,00Dh,0D7h
+	db	054h,083h,004h,04Eh,0C2h,0B3h,003h,039h
+	db	061h,026h,067h,0A7h,0F7h,016h,060h,0D0h
+	db	04Dh,047h,069h,049h,0DBh,077h,06Eh,03Eh
+	db	04Ah,06Ah,0D1h,0AEh,0DCh,05Ah,0D6h,0D9h
+	db	066h,00Bh,0DFh,040h,0F0h,03Bh,0D8h,037h
+	db	053h,0AEh,0BCh,0A9h,0C5h,09Eh,0BBh,0DEh
+	db	07Fh,0CFh,0B2h,047h,0E9h,0FFh,0B5h,030h
+	db	01Ch,0F2h,0BDh,0BDh,08Ah,0C2h,0BAh,0CAh
+	db	030h,093h,0B3h,053h,0A6h,0A3h,0B4h,024h
+	db	005h,036h,0D0h,0BAh,093h,006h,0D7h,0CDh
+	db	029h,057h,0DEh,054h,0BFh,067h,0D9h,023h
+	db	02Eh,07Ah,066h,0B3h,0B8h,04Ah,061h,0C4h
+	db	002h,01Bh,068h,05Dh,094h,02Bh,06Fh,02Ah
+	db	037h,0BEh,00Bh,0B4h,0A1h,08Eh,00Ch,0C3h
+	db	01Bh,0DFh,005h,05Ah,08Dh,0EFh,002h,02Dh
 ;
 ; uninitialized storage
 ;
@@ -2237,6 +2408,7 @@
 	ds	24
 mtchfcb:
 	ds	11
+; note that as indicated above, bitbuf must be the byte before bleft
 bitbuf:	ds	1
 vars:
 bleft:	ds	1
@@ -2250,7 +2422,7 @@
 	ds	1
 D_shift:
 	ds	1
-V:	ds	1
+urV:	ds	1
 nchar:	ds	1
 lchar:	ds	1
 ExState:
@@ -2311,5 +2483,5 @@
 disttr:	ds	4 * nrdist
 endtr:
 	ds	8192 + 2 - (endtr - lenld)
-
+endaddr:	; must be no vars/data beyond this point
 	end
--------------- unzip152-rdbybits-and-crc32tab.diff ---------------
And I may as well include the C code to generate the table, again
based on degzip_portable.c:
-------------------- gentable.c --------------------
#include <stdio.h>
/*
 * Print a 256-entry, 32-bit lookup table as Z80 assembler "db" lines.
 *
 * Each entry is computed by running the index through eight rounds of
 * shift-right-and-conditionally-xor with 0xedb88320, then emitted as four
 * bytes, least significant first, formatted as three hex digits plus an
 * "h" suffix (e.g. 096h).  Two entries (eight bytes) are written per line.
 */
int main(void)
{
    unsigned long entry, idx, round;

    for (idx = 0; idx < 256; idx++) {
        /* Every even-numbered entry starts a fresh "db" line. */
        if ((idx & 1) == 0)
            printf("\tdb\t");

        entry = idx;
        for (round = 0; round < 8; round++)
            entry = (entry >> 1) ^ ((entry & 1) ? 0xedb88320UL : 0);

        /*
         * The values are unsigned long, so the conversion needs the "l"
         * length modifier; a plain %X expects unsigned int.
         */
        printf("%03lXh,%03lXh,%03lXh,%03lXh",
               entry & 255, (entry >> 8) & 255,
               (entry >> 16) & 255, (entry >> 24) & 255);

        /* Comma between the two entries on a line, newline after the second. */
        putchar(((idx & 1) == 1) ? '\n' : ',');
    }

    return 0;
}
-------------------- gentable.c --------------------
-Rus.