[go] hash/crc32: interleave PMULL with CRC to accelerate CRC32 for ARM64

155 views
Skip to first unread message

Ruinan Sun (Gerrit)

unread,
Jun 6, 2022, 12:18:02 AM6/6/22
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Ruinan Sun has uploaded this change for review.

View Change

hash/crc32: interleave PMULL with CRC to accelerate CRC32 for ARM64

This CL interleaves the PMULL instruction with CRC instruction to
accelerate the CRC32 computation for ARM64. In most modern ARM
processors, PMULL and CRC can be executed in different pipelines, so
this interleaving will get better parallelism and performance.

Benchmark master ThisRun Delta
CRC32/poly=IEEE/size=4kB/align=0 8.78GB/s 11.72GB/s +33.45%
CRC32/poly=IEEE/size=4kB/align=1 8.79GB/s 8.61GB/s -2.05%
CRC32/poly=IEEE/size=32kB/align=0 8.51GB/s 11.58GB/s +36.05%
CRC32/poly=IEEE/size=32kB/align=1 8.51GB/s 11.07GB/s +30.10%

Change-Id: I900e65b0d76ff86681a06ccd17d278f2990d0354
---
M src/hash/crc32/crc32_arm64.go
M src/hash/crc32/crc32_arm64.s
2 files changed, 218 insertions(+), 1 deletion(-)

diff --git a/src/hash/crc32/crc32_arm64.go b/src/hash/crc32/crc32_arm64.go
index 9674b76..25493ae 100644
--- a/src/hash/crc32/crc32_arm64.go
+++ b/src/hash/crc32/crc32_arm64.go
@@ -8,10 +8,14 @@

package crc32

-import "internal/cpu"
+import (
+ "internal/cpu"
+ "unsafe"
+)

func castagnoliUpdate(crc uint32, p []byte) uint32
func ieeeUpdate(crc uint32, p []byte) uint32
+func ieeeInterleave(crc uint32, p []byte) uint32

func archAvailableCastagnoli() bool {
return cpu.ARM64.HasCRC32
@@ -46,5 +50,22 @@
panic("arch-specific crc32 instruction for IEEE not available")
}

+ if cpu.ARM64.HasPMULL && len(p) >= 4096 {
+ length := len(p)
+ // align with 16 bytes
+ head := 16 - uintptr(unsafe.Pointer(&p[0]))%16
+ if head != 16 {
+ crc = ^ieeeUpdate(^crc, p[:head])
+ length -= int(head)
+ p = p[head:]
+ }
+ // the length may less than 4K because of the previous alignment, check it again.
+ if length >= 4096 {
+ length -= length % 4096
+ crc = ^ieeeInterleave(^crc, p[:length])
+ p = p[length:]
+ }
+ }
+
return ^ieeeUpdate(^crc, p)
}
diff --git a/src/hash/crc32/crc32_arm64.s b/src/hash/crc32/crc32_arm64.s
index 85a113f..97ee29b 100644
--- a/src/hash/crc32/crc32_arm64.s
+++ b/src/hash/crc32/crc32_arm64.s
@@ -95,3 +95,179 @@
done:
MOVWU R9, ret+32(FP)
RET
+
+// 6 * 16 = 96 bytes, compute by CRC32X
+#define CRC32BLOCK \
+ LDP.P 16(R4), (R5, R6) \
+ CRC32X R5, R0, R0 \
+ CRC32X R6, R0, R0 \
+ LDP.P 16(R4), (R5, R6) \
+ CRC32X R5, R0, R0 \
+ CRC32X R6, R0, R0 \
+ LDP.P 16(R4), (R5, R6) \
+ CRC32X R5, R0, R0 \
+ CRC32X R6, R0, R0 \
+ LDP.P 16(R4), (R5, R6) \
+ CRC32X R5, R0, R0 \
+ CRC32X R6, R0, R0 \
+ LDP.P 16(R4), (R5, R6) \
+ CRC32X R5, R0, R0 \
+ CRC32X R6, R0, R0 \
+ LDP.P 16(R4), (R5, R6) \
+ CRC32X R5, R0, R0 \
+ CRC32X R6, R0, R0
+
+// 2 * 16 = 32 bytes, compute by PMULL
+#define PMULLBLOCK \
+ VLD1.P 32(R3), [V2.B16, V3.B16] \
+ VPMULL V16.D1, V0.D1, V8.Q1 \
+ VPMULL2 V16.D2, V0.D2, V0.Q1 \
+ VPMULL V16.D1, V1.D1, V9.Q1 \
+ VPMULL2 V16.D2, V1.D2, V1.Q1 \
+ VEOR V0.B16, V8.B16, V8.B16 \
+ VEOR V1.B16, V9.B16, V9.B16 \
+ VEOR V2.B16, V8.B16, V0.B16 \
+ VEOR V3.B16, V9.B16, V1.B16
+
+// ieeeInterleave updates the non-inverted crc with the given data.
+// Basic idea:
+// 1. Seperate the data M(x) into several parts.
+// 2. Compute the result for each part.
+// 3. Merge the results.
+//
+// Explanation:
+// In the following formula: M(x) is the data, l1 = length(M(x))
+//
+// crc32(initCRC, M(x)) = R(x) = (initCRC * 2^l1 + crc32(0, M(x))) mod P(x)
+// = initCRC * 2^(l1-32) * 2^32 mod P(x) + crc32(0, M(x))
+//
+// let C1 = 2^(l1-32) mod P(x), we get:
+// crc32(initCRC, M(x)) = R(x) = crc32(initCRC * C1) + crc32(0, M(x))
+//
+// split M(x) into H(x) and L(x), let l2 = length(L(x))
+// crc32(0, M(x)) = (H(x) * 2^l2 + L(x)) * 2^32 mod P(x)
+// = crc32(0, H(x)) * 2^l2 mod P(x) + crc32(0, L(x))
+// = crc32(0, H(x)) * 2^(l2-32) * 2^32 mod P(x) + crc32(0, L(x))
+//
+// let: crc1 = crc32(0, H(x))
+// crc2 = crc32(0, L(x))
+// C2 = 2^(l2-32) mod P(x)
+// finally we get:
+// crc32(initCRC, M(x)) = R(x) = crc32(0, initCRC * C1) + crc32(0, crc1 * C2) + crc2
+//
+// C1 and C2 can be pre-computed.
+//
+// Implementation:
+// Our implementation will interleave CRC instructions with PMULL instructions to
+// compute the final result. In the PMULL folding part we use the similar idea
+// as ieeeCLMUL implemented by crc32_amd64.s. Since PMULL and CRC instructions can
+// be executed in the different pipelines in most modern ARM processors, this
+// interleaving will get better throughput and parallelism.
+//
+// Since PMULL is not fast as CRC32 instructions, we won't use PMULL a lot in the calculation.
+// According to our tests, this interleaving improves the performance for most ARM processors
+// when the block size is 4K and the rate of CRC32X to PMULL mix is 3:1.
+//
+//
+// (low address) M(x) (high address)
+// -----------------------------------------------------------------
+// block 1 | block 2 | ... | block n | ...
+// -----------------------------------------------------------------
+// 4K bytes 4K bytes
+//
+// block n
+// --------------------------------
+// H(x) | L(x)
+// --------------------------------
+// 3072 bytes 1024 bytes
+//
+//
+// func ieeeInterleave(crc uint32, p []byte) uint32
+// len(p) is a multiple of 4096
+TEXT ·ieeeInterleave(SB),NOSPLIT,$0-36
+ MOVWU crc+0(FP), R0 // initCRC
+ MOVD p+8(FP), R1 // data pointer
+ MOVD p_len+16(FP), R2 // len(p)
+ VMOVQ $0x00000000f1da05aa, $0x0000000081256527, V16 // folding const
+ VMOVQ $0x00000001751997d0, $0x00000000ccaa009e, V17 // folding const
+ VMOVD $0x0000000163cd6124, V18 // folding const
+ VMOVD $0x00000001F7011641, V19 // mu, a const used for barrett reduction
+ VMOVD $0x00000001DB710641, V20 // P(x)
+ VMOVD $0x00000000bbf2f6d6, V21 // const C2
+ VMOVD $0x0000000068c0a2c5, V22 // const C1
+
+loop4096:
+ // record the initial value(R0) in R10 and set R0 to zero, so the CRC instructions
+ // in this iteration will have no dependency with the previous results, which will
+ // benefit the out of order processors.
+ MOVD R0, R10
+ MOVD ZR, R0
+ MOVD R1, R4
+ // get start address of L(x)
+ ADD $3072, R4, R3
+ // load first 2 * 16 = 32 bytes as the initial folding value for PMULLBLOCK
+ VLD1.P 32(R3), [V0.B16, V1.B16]
+ ADD $4096, R1, R1
+ SUB $4096, R2, R2
+
+ // Per loop computes 128 bytes, so we need 4096/128 = 32 rounds computation.
+ // Because the first 32 bytes in PMULL block are pre-load, only 31 rounds in the loop.
+ MOVD $31, R9
+innerLoop:
+ CRC32BLOCK
+ PMULLBLOCK
+
+ SUBS $1, R9, R9
+ BNE innerLoop
+
+ // Only 31 rounds for CRC32BLOCK, so we have to do another round here.
+ CRC32BLOCK
+ // we already have crc1 = CRC32(0, H(x)), stored in R0
+
+ // compute crc2
+ // fold 256 bytes to 128 bytes
+ VPMULL V17.D1, V0.D1, V8.Q1
+ VPMULL2 V17.D2, V0.D2, V0.Q1
+ VEOR V8.B16, V0.B16, V0.B16
+ VEOR V1.B16, V0.B16, V0.B16
+
+ // fold 128 bytes to 96 bytes
+ VEXT $8, V0.B16, V0.B16, V0.B16
+ VPMULL2 V17.D2, V0.D2, V8.Q1
+ VMOV ZR, V0.D[1]
+ VEOR V0.B16, V8.B16, V0.B16
+
+ // fold 96 bytes to 64 bytes
+ VEOR V8.B16, V8.B16, V8.B16
+ VMOV V0.S[0], V8.S[0]
+ VPMULL V18.D1, V8.D1, V8.Q1
+ VEXT $4, V0.B16, V0.B16, V0.B16
+ VEOR V0.B16, V8.B16, V0.B16
+
+ // barrett reduction
+ VEOR V8.B16, V8.B16, V8.B16
+ VMOV V0.S[0], V8.S[0]
+ VPMULL V19.D1, V8.D1, V8.Q1
+ VMOV ZR, V8.S[1]
+ VPMULL V20.D1, V8.D1, V8.Q1
+ VEOR V8.B16, V0.B16, V0.B16
+ // we already get crc2, stored in V0.S[1]
+
+ // merge results from both parts
+ // R(x) = crc32(0, initCRC * C1) + crc32(0, crc1 * C2) + crc2
+ VMOV R0, V8.D[0]
+ VMOV R10, V9.D[0]
+ VPMULL V21.D1, V8.D1, V8.Q1
+ VPMULL V22.D1, V9.D1, V9.Q1
+ VMOV V8.D[0], R0
+ VMOV V9.D[0], R10
+ CRC32X R0, ZR, R0
+ CRC32X R10, ZR, R10
+ EORW R0, R10, R0
+ VMOV V0.S[1], R7
+ EORW R0, R7, R0
+ CMP $4096, R2
+ BGE loop4096
+end:
+ MOVWU R0, ret+32(FP)
+ RET

To view, visit change 410348. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: go
Gerrit-Branch: master
Gerrit-Change-Id: I900e65b0d76ff86681a06ccd17d278f2990d0354
Gerrit-Change-Number: 410348
Gerrit-PatchSet: 1
Gerrit-Owner: Ruinan Sun <Ruina...@arm.com>
Gerrit-MessageType: newchange

Eric Fang (Gerrit)

unread,
Jun 6, 2022, 3:56:20 AM6/6/22
to Ruinan Sun, goph...@pubsubhelper.golang.org, Cherry Mui, Keith Randall, golang-co...@googlegroups.com

Attention is currently required from: Cherry Mui, Keith Randall, Ruinan Sun.

Patch set 1:Run-TryBot +1

View Change

    To view, visit change 410348. To unsubscribe, or for help writing mail filters, visit settings.

    Gerrit-Project: go
    Gerrit-Branch: master
    Gerrit-Change-Id: I900e65b0d76ff86681a06ccd17d278f2990d0354
    Gerrit-Change-Number: 410348
    Gerrit-PatchSet: 1
    Gerrit-Owner: Ruinan Sun <Ruina...@arm.com>
    Gerrit-Reviewer: Cherry Mui <cher...@google.com>
    Gerrit-Reviewer: Eric Fang <eric...@arm.com>
    Gerrit-Reviewer: Keith Randall <k...@google.com>
    Gerrit-Attention: Keith Randall <k...@google.com>
    Gerrit-Attention: Ruinan Sun <Ruina...@arm.com>
    Gerrit-Attention: Cherry Mui <cher...@google.com>
    Gerrit-Comment-Date: Mon, 06 Jun 2022 07:56:14 +0000
    Gerrit-HasComments: No
    Gerrit-Has-Labels: Yes
    Gerrit-MessageType: comment

    Ruinan Sun (Gerrit)

    unread,
    Aug 24, 2022, 11:29:21 PM8/24/22
    to goph...@pubsubhelper.golang.org, Filippo Valsorda, Gopher Robot, Eric Fang, Cherry Mui, Keith Randall, golang-co...@googlegroups.com

    Attention is currently required from: Cherry Mui, Filippo Valsorda, Keith Randall.

    View Change

    1 comment:

    • Patchset:

      • Patch Set #1:

        This is a pipeline optimization about crc32 on arm64.

        A kindly ping, Thanks~!

    To view, visit change 410348. To unsubscribe, or for help writing mail filters, visit settings.

    Gerrit-Project: go
    Gerrit-Branch: master
    Gerrit-Change-Id: I900e65b0d76ff86681a06ccd17d278f2990d0354
    Gerrit-Change-Number: 410348
    Gerrit-PatchSet: 1
    Gerrit-Owner: Ruinan Sun <Ruina...@arm.com>
    Gerrit-Reviewer: Cherry Mui <cher...@google.com>
    Gerrit-Reviewer: Eric Fang <eric...@arm.com>
    Gerrit-Reviewer: Filippo Valsorda <fil...@golang.org>
    Gerrit-Reviewer: Gopher Robot <go...@golang.org>
    Gerrit-Reviewer: Keith Randall <k...@google.com>
    Gerrit-Attention: Keith Randall <k...@google.com>
    Gerrit-Attention: Filippo Valsorda <fil...@golang.org>
    Gerrit-Attention: Cherry Mui <cher...@google.com>
    Gerrit-Comment-Date: Thu, 25 Aug 2022 03:29:16 +0000
    Gerrit-HasComments: Yes
    Gerrit-Has-Labels: No
    Gerrit-MessageType: comment

    Eric Fang (Gerrit)

    unread,
    Sep 29, 2022, 11:08:41 PM9/29/22
    to Ruinan Sun, goph...@pubsubhelper.golang.org, Filippo Valsorda, Gopher Robot, Cherry Mui, Keith Randall, golang-co...@googlegroups.com

    Attention is currently required from: Cherry Mui, Filippo Valsorda, Keith Randall, Ruinan Sun.

    Patch set 2:Run-TryBot +1

    View Change

      To view, visit change 410348. To unsubscribe, or for help writing mail filters, visit settings.

      Gerrit-Project: go
      Gerrit-Branch: master
      Gerrit-Change-Id: I900e65b0d76ff86681a06ccd17d278f2990d0354
      Gerrit-Change-Number: 410348
      Gerrit-PatchSet: 2
      Gerrit-Owner: Ruinan Sun <Ruina...@arm.com>
      Gerrit-Reviewer: Cherry Mui <cher...@google.com>
      Gerrit-Reviewer: Eric Fang <eric...@arm.com>
      Gerrit-Reviewer: Filippo Valsorda <fil...@golang.org>
      Gerrit-Reviewer: Gopher Robot <go...@golang.org>
      Gerrit-Reviewer: Keith Randall <k...@google.com>
      Gerrit-Attention: Keith Randall <k...@google.com>
      Gerrit-Attention: Ruinan Sun <Ruina...@arm.com>
      Gerrit-Attention: Filippo Valsorda <fil...@golang.org>
      Gerrit-Attention: Cherry Mui <cher...@google.com>
      Gerrit-Comment-Date: Fri, 30 Sep 2022 03:08:34 +0000
      Reply all
      Reply to author
      Forward
      0 new messages