[go] hash/crc32: interleave PMULL with CRC to accelerate CRC32 for ARM64

Ruinan Sun (Gerrit)

unread,

Jun 6, 2022, 12:18:02 AM6/6/22

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Ruinan Sun has uploaded this change for review.

hash/crc32: interleave PMULL with CRC to accelerate CRC32 for ARM64

This CL interleaves the PMULL instruction with CRC instruction to
accelerate the CRC32 computation for ARM64. In most modern ARM
processors, PMULL and CRC can be executed in different pipelines, so
this interleaving will get better parallelism and performance.

Benchmark                           master     ThisRun     Delta
CRC32/poly=IEEE/size=4kB/align=0    8.78GB/s   11.72GB/s   +33.45%
CRC32/poly=IEEE/size=4kB/align=1    8.79GB/s    8.61GB/s    -2.05%
CRC32/poly=IEEE/size=32kB/align=0   8.51GB/s   11.58GB/s   +36.05%
CRC32/poly=IEEE/size=32kB/align=1   8.51GB/s   11.07GB/s   +30.10%

Change-Id: I900e65b0d76ff86681a06ccd17d278f2990d0354
---
M src/hash/crc32/crc32_arm64.go
M src/hash/crc32/crc32_arm64.s
2 files changed, 218 insertions(+), 1 deletion(-)

diff --git a/src/hash/crc32/crc32_arm64.go b/src/hash/crc32/crc32_arm64.go
index 9674b76..25493ae 100644
--- a/src/hash/crc32/crc32_arm64.go
+++ b/src/hash/crc32/crc32_arm64.go
@@ -8,10 +8,14 @@
 
 package crc32
 
-import "internal/cpu"
+import (
+	"internal/cpu"
+	"unsafe"
+)
 
 func castagnoliUpdate(crc uint32, p []byte) uint32
 func ieeeUpdate(crc uint32, p []byte) uint32
+func ieeeInterleave(crc uint32, p []byte) uint32
 
 func archAvailableCastagnoli() bool {
 	return cpu.ARM64.HasCRC32
@@ -46,5 +50,22 @@
 		panic("arch-specific crc32 instruction for IEEE not available")
 	}
 
+	if cpu.ARM64.HasPMULL && len(p) >= 4096 {
+		length := len(p)
+		// align with 16 bytes
+		head := 16 - uintptr(unsafe.Pointer(&p[0]))%16
+		if head != 16 {
+			crc = ^ieeeUpdate(^crc, p[:head])
+			length -= int(head)
+			p = p[head:]
+		}
+		// the length may less than 4K because of the previous alignment, check it again.
+		if length >= 4096 {
+			length -= length % 4096
+			crc = ^ieeeInterleave(^crc, p[:length])
+			p = p[length:]
+		}
+	}
+
 	return ^ieeeUpdate(^crc, p)
 }
diff --git a/src/hash/crc32/crc32_arm64.s b/src/hash/crc32/crc32_arm64.s
index 85a113f..97ee29b 100644
--- a/src/hash/crc32/crc32_arm64.s
+++ b/src/hash/crc32/crc32_arm64.s
@@ -95,3 +95,179 @@
 done:
 	MOVWU	R9, ret+32(FP)
 	RET
+
+// 6 * 16 = 96 bytes, compute by CRC32X
+#define CRC32BLOCK \
+	LDP.P	16(R4), (R5, R6) \
+	CRC32X	R5, R0, R0 \
+	CRC32X	R6, R0, R0 \
+	LDP.P	16(R4), (R5, R6) \
+	CRC32X	R5, R0, R0 \
+	CRC32X	R6, R0, R0 \
+	LDP.P	16(R4), (R5, R6) \
+	CRC32X	R5, R0, R0 \
+	CRC32X	R6, R0, R0 \
+	LDP.P	16(R4), (R5, R6) \
+	CRC32X	R5, R0, R0 \
+	CRC32X	R6, R0, R0 \
+	LDP.P	16(R4), (R5, R6) \
+	CRC32X	R5, R0, R0 \
+	CRC32X	R6, R0, R0 \
+	LDP.P	16(R4), (R5, R6) \
+	CRC32X	R5, R0, R0 \
+	CRC32X	R6, R0, R0
+
+// 2 * 16 = 32 bytes, compute by PMULL
+#define PMULLBLOCK \
+	VLD1.P	32(R3), [V2.B16, V3.B16] \
+	VPMULL	V16.D1, V0.D1, V8.Q1 \
+	VPMULL2	V16.D2, V0.D2, V0.Q1 \
+	VPMULL	V16.D1, V1.D1, V9.Q1 \
+	VPMULL2	V16.D2, V1.D2, V1.Q1 \
+	VEOR	V0.B16, V8.B16, V8.B16 \
+	VEOR	V1.B16, V9.B16, V9.B16 \
+	VEOR	V2.B16, V8.B16, V0.B16 \
+	VEOR	V3.B16, V9.B16, V1.B16
+
+// ieeeInterleave updates the non-inverted crc with the given data.
+// Basic idea:
+//   1. Seperate the data M(x) into several parts.
+//   2. Compute the result for each part.
+//   3. Merge the results.
+//
+// Explanation:
+//   In the following formula: M(x) is the data, l1 = length(M(x))
+//
+//   crc32(initCRC, M(x)) = R(x) = (initCRC * 2^l1 + crc32(0, M(x))) mod P(x)
+//                               = initCRC * 2^(l1-32) * 2^32 mod P(x) + crc32(0, M(x))
+//
+//   let C1 = 2^(l1-32) mod P(x), we get:
+//   crc32(initCRC, M(x)) = R(x) = crc32(initCRC * C1) + crc32(0, M(x))
+//
+//   split M(x) into H(x) and L(x), let l2 = length(L(x))
+//   crc32(0, M(x)) = (H(x) * 2^l2 + L(x)) * 2^32 mod P(x)
+//                  = crc32(0, H(x)) * 2^l2 mod P(x) + crc32(0, L(x))
+//                  = crc32(0, H(x)) * 2^(l2-32) * 2^32 mod P(x) + crc32(0, L(x))
+//
+//   let: crc1 = crc32(0, H(x))
+//        crc2 = crc32(0, L(x))
+//        C2 = 2^(l2-32) mod P(x)
+//   finally we get:
+//   crc32(initCRC, M(x)) = R(x) = crc32(0, initCRC * C1) + crc32(0, crc1 * C2) + crc2
+//
+//   C1 and C2 can be pre-computed.
+//
+// Implementation:
+//   Our implementation will interleave CRC instructions with PMULL instructions to
+//   compute the final result. In the PMULL folding part we use the similar idea
+//   as ieeeCLMUL implemented by crc32_amd64.s. Since PMULL and CRC instructions can
+//   be executed in the different pipelines in most modern ARM processors, this
+//   interleaving will get better throughput and parallelism.
+//
+//   Since PMULL is not fast as CRC32 instructions, we won't use PMULL a lot in the calculation.
+//   According to our tests, this interleaving improves the performance for most ARM processors
+//   when the block size is 4K and the rate of CRC32X to PMULL mix is 3:1.
+//
+//
+//      (low address)                      M(x)                       (high address)
+//           -----------------------------------------------------------------
+//                block 1    |     block 2    |  ... |   block n    |   ...
+//           -----------------------------------------------------------------
+//                4K bytes        4K bytes
+//
+//                         block n
+//           --------------------------------
+//                    H(x)       |   L(x)
+//           --------------------------------
+//                 3072 bytes      1024 bytes
+//
+//
+// func ieeeInterleave(crc uint32, p []byte) uint32
+// len(p) is a multiple of 4096
+TEXT ·ieeeInterleave(SB),NOSPLIT,$0-36
+	MOVWU	crc+0(FP), R0     // initCRC
+	MOVD	p+8(FP), R1       // data pointer
+	MOVD	p_len+16(FP), R2  // len(p)
+	VMOVQ	$0x00000000f1da05aa, $0x0000000081256527, V16 // folding const
+	VMOVQ	$0x00000001751997d0, $0x00000000ccaa009e, V17 // folding const
+	VMOVD	$0x0000000163cd6124, V18 // folding const
+	VMOVD	$0x00000001F7011641, V19 // mu, a const used for barrett reduction
+	VMOVD	$0x00000001DB710641, V20 // P(x)
+	VMOVD	$0x00000000bbf2f6d6, V21 // const C2
+	VMOVD	$0x0000000068c0a2c5, V22 // const C1
+
+loop4096:
+	// record the initial value(R0) in R10 and set R0 to zero, so the CRC instructions
+	// in this iteration will have no dependency with the previous results, which will
+	// benefit the out of order processors.
+	MOVD	R0, R10
+	MOVD	ZR, R0
+	MOVD	R1, R4
+	// get start address of L(x)
+	ADD	$3072, R4, R3
+	// load first 2 * 16 = 32 bytes as the initial folding value for PMULLBLOCK
+	VLD1.P	32(R3), [V0.B16, V1.B16]
+	ADD	$4096, R1, R1
+	SUB	$4096, R2, R2
+
+	// Per loop computes 128 bytes, so we need 4096/128 = 32 rounds computation.
+	// Because the first 32 bytes in PMULL block are pre-load, only 31 rounds in the loop.
+	MOVD	$31, R9
+innerLoop:
+	CRC32BLOCK
+	PMULLBLOCK
+
+	SUBS	$1, R9, R9
+	BNE	innerLoop
+
+	// Only 31 rounds for CRC32BLOCK, so we have to do another round here.
+	CRC32BLOCK
+	// we already have crc1 = CRC32(0, H(x)), stored in R0
+
+	// compute crc2
+	// fold 256 bytes to 128 bytes
+	VPMULL	V17.D1, V0.D1, V8.Q1
+	VPMULL2	V17.D2, V0.D2, V0.Q1
+	VEOR	V8.B16, V0.B16, V0.B16
+	VEOR	V1.B16, V0.B16, V0.B16
+
+	// fold 128 bytes to 96 bytes
+	VEXT	$8, V0.B16, V0.B16, V0.B16
+	VPMULL2	V17.D2, V0.D2, V8.Q1
+	VMOV	ZR, V0.D[1]
+	VEOR	V0.B16, V8.B16, V0.B16
+
+	// fold 96 bytes to 64 bytes
+	VEOR	V8.B16, V8.B16, V8.B16
+	VMOV	V0.S[0], V8.S[0]
+	VPMULL	V18.D1, V8.D1, V8.Q1
+	VEXT	$4, V0.B16, V0.B16, V0.B16
+	VEOR	V0.B16, V8.B16, V0.B16
+
+	// barrett reduction
+	VEOR	V8.B16, V8.B16, V8.B16
+	VMOV	V0.S[0], V8.S[0]
+	VPMULL	V19.D1, V8.D1, V8.Q1
+	VMOV	ZR, V8.S[1]
+	VPMULL	V20.D1, V8.D1, V8.Q1
+	VEOR	V8.B16, V0.B16, V0.B16
+	// we already get crc2, stored in V0.S[1]
+
+	// merge results from both parts
+	// R(x) = crc32(0, initCRC * C1) + crc32(0, crc1 * C2) + crc2
+	VMOV	R0, V8.D[0]
+	VMOV	R10, V9.D[0]
+	VPMULL	V21.D1, V8.D1, V8.Q1
+	VPMULL	V22.D1, V9.D1, V9.Q1
+	VMOV	V8.D[0], R0
+	VMOV	V9.D[0], R10
+	CRC32X	R0, ZR, R0
+	CRC32X	R10, ZR, R10
+	EORW	R0, R10, R0
+	VMOV	V0.S[1], R7
+	EORW	R0, R7, R0
+	CMP	$4096, R2
+	BGE	loop4096
+end:
+	MOVWU	R0, ret+32(FP)
+	RET

To view, visit change 410348. To unsubscribe, or for help writing mail filters, visit settings.

Eric Fang (Gerrit)

unread,

Jun 6, 2022, 3:56:20 AM6/6/22

to Ruinan Sun, goph...@pubsubhelper.golang.org, Cherry Mui, Keith Randall, golang-co...@googlegroups.com

Attention is currently required from: Cherry Mui, Keith Randall, Ruinan Sun.

Patch set 1:Run-TryBot +1

View Change

To view, visit change 410348. To unsubscribe, or for help writing mail filters, visit settings.

Ruinan Sun (Gerrit)

unread,

Aug 24, 2022, 11:29:21 PM8/24/22

to goph...@pubsubhelper.golang.org, Filippo Valsorda, Gopher Robot, Eric Fang, Cherry Mui, Keith Randall, golang-co...@googlegroups.com

Attention is currently required from: Cherry Mui, Filippo Valsorda, Keith Randall.

View Change

1 comment:

Patchset:
- Patch Set #1:
  This is a pipeline optimization about crc32 on arm64.
  A kindly ping, Thanks~!

To view, visit change 410348. To unsubscribe, or for help writing mail filters, visit settings.

Eric Fang (Gerrit)

unread,

Sep 29, 2022, 11:08:41 PM9/29/22

to Ruinan Sun, goph...@pubsubhelper.golang.org, Filippo Valsorda, Gopher Robot, Cherry Mui, Keith Randall, golang-co...@googlegroups.com

Attention is currently required from: Cherry Mui, Filippo Valsorda, Keith Randall, Ruinan Sun.

Patch set 2:Run-TryBot +1

View Change

To view, visit change 410348. To unsubscribe, or for help writing mail filters, visit settings.

Reply all

Reply to author

Forward