Ted Painter has uploaded this change for review.
crypto/sha256: add sha-ni implementation
Change-Id: Ie9783647fe82f40fcbd91989a96a24f2d3d5b9a0
---
M src/crypto/sha256/sha256block_amd64.go
M src/crypto/sha256/sha256block_amd64.s
2 files changed, 154 insertions(+), 2 deletions(-)
diff --git a/src/crypto/sha256/sha256block_amd64.go b/src/crypto/sha256/sha256block_amd64.go
index 27464e2..b5d2c9b 100644
--- a/src/crypto/sha256/sha256block_amd64.go
+++ b/src/crypto/sha256/sha256block_amd64.go
@@ -7,3 +7,4 @@
import "internal/cpu"
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
+var useSHA = useAVX2 && cpu.X86.HasSHA
diff --git a/src/crypto/sha256/sha256block_amd64.s b/src/crypto/sha256/sha256block_amd64.s
index f6af47c..c0ed494 100644
--- a/src/crypto/sha256/sha256block_amd64.s
+++ b/src/crypto/sha256/sha256block_amd64.s
@@ -550,9 +550,80 @@
; \
ADDL y3, h // h = t1 + S0 + MAJ // --
+// Definitions for sha-ni version
+//
+// The sha-ni implementation uses the Intel(R) SHA extension instructions SHA256RNDS2, SHA256MSG1, and SHA256MSG2.
+// It also reuses portions of the flip_mask (half) and the K256 table (stride 32) from the avx2 version.
+//
+// Reference
+// S. Gulley, et al, "New Instructions Supporting the Secure Hash
+// Algorithm on Intel® Architecture Processors", July 2013
+// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
+//
+
+#define digestPtr DI // input/output, base pointer to digest hash vector H0, H1, ..., H7
+#define dataPtr SI // input, base pointer to first input data block
+#define numBytes DX // input, number of input bytes to be processed
+#define sha256Constants AX // round constants from K256 table, indexed by round number x 32
+#define msg X0 // input data
+#define state0 X1 // round intermediates and outputs
+#define state1 X2
+#define m0 X3 // m0, m1,... m4 -- round message temps
+#define m1 X4
+#define m2 X5
+#define m3 X6
+#define m4 X7
+#define shufMask X8 // input data endian conversion control mask
+#define abefSave X9 // digest hash vector inter-block buffer abef
+#define cdghSave X10 // digest hash vector inter-block buffer cdgh
+
+#define nop(m,a) // nop instead of final SHA256MSG1 for first and last few rounds
+
+#define sha256msg1(m,a) \ // final SHA256MSG1 for middle rounds that require it
+ SHA256MSG1 m, a
+
+#define vmov(a,b) \ // msg copy for all but rounds 12-15
+ VMOVDQA a, b
+
+#define vmovrev(a,b) \ // reverse copy for rounds 12-15
+ VMOVDQA b, a
+
+// sha rounds 0 to 11
+// identical with the exception of the final msg op
+// which is replaced with a nop for rounds where it is not needed
+// refer to Gulley, et al for more information
+#define rounds0to11(m,a,c,sha256Msg1) \
+ VMOVDQU c*16(dataPtr), msg \
+ PSHUFB shufMask, msg \
+ VMOVDQA msg, m \
+ PADDD (c*32)(sha256Constants), msg \
+ SHA256RNDS2 msg, state0, state1 \
+ PSHUFD $0x0e, msg, msg \
+ SHA256RNDS2 msg, state1, state0 \
+ sha256Msg1 (m,a)
+
+// sha rounds 12 to 59
+// identical with the exception of the final msg op
+// and the reverse copy(m,msg) in round 12 which is required
+// after the last data load
+// refer to Gulley, et al for more information
+#define rounds12to59(m,c,a,t,sha256Msg1,movop) \
+ movop (m,msg) \
+ PADDD (c*32)(sha256Constants), msg \
+ SHA256RNDS2 msg, state0, state1 \
+ VMOVDQA m, m4 \
+ PALIGNR $4, a, m4 \
+ PADDD m4, t \
+ SHA256MSG2 m, t \
+ PSHUFD $0x0e, msg, msg \
+ SHA256RNDS2 msg, state1, state0 \
+ sha256Msg1 (m,a)
+
TEXT ·block(SB), 0, $536-32
- CMPB ·useAVX2(SB), $1
- JE avx2
+ CMPB ·useSHA(SB), $1
+ JE sha_ni
+ CMPB ·useAVX2(SB), $1
+ JE avx2
MOVQ p_base+8(FP), SI
MOVQ p_len+16(FP), DX
@@ -862,6 +933,77 @@
VZEROUPPER
RET
+sha_ni:
+ MOVQ dig+0(FP), digestPtr // init digest hash vector H0, H1,..., H7 pointer
+ MOVQ p_base+8(FP), dataPtr // init input data base pointer
+ MOVQ p_len+16(FP), numBytes // get number of input bytes to hash
+ SHRQ $6, numBytes // round input length down to a multiple of 64 bytes
+ SHLQ $6, numBytes
+ CMPQ numBytes, $0 // exit early for zero-length input buffer
+ JEQ done
+ ADDQ dataPtr, numBytes // point numBytes to end of input buffer
+ VMOVDQU (0*16)(digestPtr), state0 // load initial hash values and reorder
+ VMOVDQU (1*16)(digestPtr), state1 // DCBA, HGFE -> ABEF, CDGH
+ PSHUFD $0xb1, state0, state0 // CDAB
+ PSHUFD $0x1b, state1, state1 // EFGH
+ VMOVDQA state0, m4
+ PALIGNR $8, state1, state0 // ABEF
+ PBLENDW $0xf0, m4, state1 // CDGH
+ VMOVDQA flip_mask<>(SB), shufMask
+ LEAQ K256<>(SB), sha256Constants
+
+roundLoop:
+ // save hash values for addition after rounds
+ VMOVDQA state0, abefSave
+ VMOVDQA state1, cdghSave
+
+ // do rounds 0-59
+ rounds0to11 (m0,-,0,nop) // 0-3
+ rounds0to11 (m1,m0,1,sha256msg1) // 4-7
+ rounds0to11 (m2,m1,2,sha256msg1) // 8-11
+ VMOVDQU (3*16)(dataPtr), msg
+ PSHUFB shufMask, msg
+ rounds12to59 (m3,3,m2,m0,sha256msg1,vmovrev) // 12-15
+ rounds12to59 (m0,4,m3,m1,sha256msg1,vmov) // 16-19
+ rounds12to59 (m1,5,m0,m2,sha256msg1,vmov) // 20-23
+ rounds12to59 (m2,6,m1,m3,sha256msg1,vmov) // 24-27
+ rounds12to59 (m3,7,m2,m0,sha256msg1,vmov) // 28-31
+ rounds12to59 (m0,8,m3,m1,sha256msg1,vmov) // 32-35
+ rounds12to59 (m1,9,m0,m2,sha256msg1,vmov) // 36-39
+ rounds12to59 (m2,10,m1,m3,sha256msg1,vmov) // 40-43
+ rounds12to59 (m3,11,m2,m0,sha256msg1,vmov) // 44-47
+ rounds12to59 (m0,12,m3,m1,sha256msg1,vmov) // 48-51
+ rounds12to59 (m1,13,m0,m2,nop,vmov) // 52-55
+ rounds12to59 (m2,14,m1,m3,nop,vmov) // 56-59
+
+ // do rounds 60-63
+ VMOVDQA m3, msg
+ PADDD (15*32)(sha256Constants), msg
+ SHA256RNDS2 msg, state0, state1
+ PSHUFD $0x0e, msg, msg
+ SHA256RNDS2 msg, state1, state0
+
+ // add current hash values with previously saved
+ PADDD abefSave, state0
+ PADDD cdghSave, state1
+
+ // advance data pointer; loop until buffer empty
+ ADDQ $64, dataPtr
+ CMPQ numBytes, dataPtr
+ JNE roundLoop
+
+ // write hash values back in the correct order
+ PSHUFD $0x1b, state0, state0 // FEBA
+ PSHUFD $0xb1, state1, state1 // DCHG
+ VMOVDQA state0, m4
+ PBLENDW $0xf0, state1, state0 // DCBA
+ PALIGNR $8, m4, state1 // HGFE
+ VMOVDQU state0, (0*16)(digestPtr)
+ VMOVDQU state1, (1*16)(digestPtr)
+
+done:
+ RET
+
// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
To view, visit change 408795. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Filippo Valsorda, Ted Painter.
1 comment:
Patchset:
Any info regarding when this patch will be merged? There are multiple open source projects that would greatly benefit from this, like containerd when pulling images, k8s and others.
To view, visit change 408795. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Filippo Valsorda, Ted Painter.
Patch set 1:Code-Review +1
2 comments:
Patchset:
This looks good to me, apart from some minor formatting issues.
I have tested this on i7-11800H and it worked as expected, with the performance improvements below:
```
goos: linux
goarch: amd64
cpu: 11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz
Benchmark_SHA256_BEFORE-16 419570 2765 ns/op
Benchmark_SHA256_AFTER-16 1487156 786 ns/op
```
File src/crypto/sha256/sha256block_amd64.s:
The trailing spaces should be trimmed (here and in the other parts of the diff).
To view, visit change 408795. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Filippo Valsorda, Ted Painter.
Ted Painter uploaded patch set #2 to this change.
crypto/sha256: add sha-ni implementation
Fixes #957
Change-Id: Ie9783647fe82f40fcbd91989a96a24f2d3d5b9a0
---
M src/crypto/sha256/sha256block_amd64.go
M src/crypto/sha256/sha256block_amd64.s
2 files changed, 156 insertions(+), 2 deletions(-)
To view, visit change 408795. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Filippo Valsorda, Ted Painter.
1 comment:
Patchset:
Thanks for sharing this CL. Is this stalled for want of a reviewer, or is it waiting for the proposal approval (or something else)?
To view, visit change 408795. To unsubscribe, or for help writing mail filters, visit settings.
Attention is currently required from: Filippo Valsorda, Paulo Gomes, Russ Cox, Ted Painter.
Patch set 4:Code-Review +1
Attention is currently required from: Filippo Valsorda, Paulo Gomes, Russ Cox, Ted Painter.
Patch set 4:Code-Review +2
Gopher Robot submitted this change.
crypto/sha256: add sha-ni implementation
goos: linux
goarch: amd64
pkg: crypto/sha256
cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz
│ bench.old │ bench.new │
│ sec/op │ sec/op vs base │
Hash8Bytes/New-4 169.20n ± 7% 65.40n ± 5% -61.35% (p=0.000 n=10)
Hash8Bytes/Sum224-4 166.10n ± 3% 65.20n ± 8% -60.74% (p=0.000 n=10)
Hash8Bytes/Sum256-4 168.50n ± 6% 63.58n ± 7% -62.27% (p=0.000 n=10)
Hash1K/New-4 2275.5n ± 5% 618.5n ± 2% -72.82% (p=0.000 n=10)
Hash1K/Sum224-4 2364.5n ± 1% 618.1n ± 1% -73.86% (p=0.000 n=10)
Hash1K/Sum256-4 2338.5n ± 2% 613.0n ± 2% -73.79% (p=0.000 n=10)
Hash8K/New-4 17.530µ ± 2% 4.501µ ± 1% -74.33% (p=0.000 n=10)
Hash8K/Sum224-4 17.456µ ± 2% 4.505µ ± 1% -74.19% (p=0.000 n=10)
Hash8K/Sum256-4 17.417µ ± 2% 4.504µ ± 1% -74.14% (p=0.000 n=10)
geomean 1.897µ 564.3n -70.25%
│ bench.old │ bench.new │
│ B/s │ B/s vs base │
Hash8Bytes/New-4 45.11Mi ± 6% 116.66Mi ± 5% +158.62% (p=0.000 n=10)
Hash8Bytes/Sum224-4 45.92Mi ± 3% 117.04Mi ± 8% +154.89% (p=0.000 n=10)
Hash8Bytes/Sum256-4 45.29Mi ± 6% 120.00Mi ± 7% +164.99% (p=0.000 n=10)
Hash1K/New-4 429.2Mi ± 5% 1578.9Mi ± 2% +267.92% (p=0.000 n=10)
Hash1K/Sum224-4 413.0Mi ± 1% 1579.8Mi ± 1% +282.49% (p=0.000 n=10)
Hash1K/Sum256-4 417.6Mi ± 1% 1593.1Mi ± 2% +281.53% (p=0.000 n=10)
Hash8K/New-4 445.7Mi ± 1% 1735.9Mi ± 1% +289.50% (p=0.000 n=10)
Hash8K/Sum224-4 447.6Mi ± 2% 1734.5Mi ± 1% +287.54% (p=0.000 n=10)
Hash8K/Sum256-4 448.6Mi ± 2% 1734.8Mi ± 1% +286.75% (p=0.000 n=10)
geomean 204.3Mi 686.8Mi +236.11%
│ bench.old │ bench.new │
│ B/op │ B/op vs base │
Hash8Bytes/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash1K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash1K/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash1K/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash8K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash8K/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash8K/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
geomean ² +0.00% ²
¹ all samples are equal
² summaries must be >0 to compute geomean
│ bench.old │ bench.new │
│ allocs/op │ allocs/op vs base │
Hash8Bytes/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash1K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash1K/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash1K/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash8K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash8K/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
Hash8K/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹
geomean ² +0.00% ²
¹ all samples are equal
² summaries must be >0 to compute geomean
Fixes #50543.
Change-Id: Ie9783647fe82f40fcbd91989a96a24f2d3d5b9a0
Reviewed-on: https://go-review.googlesource.com/c/go/+/408795
Reviewed-by: Paulo Gomes <paulo.g...@gmail.com>
TryBot-Result: Gopher Robot <go...@golang.org>
Run-TryBot: Russ Cox <r...@golang.org>
Reviewed-by: Alan Donovan <adon...@google.com>
Auto-Submit: Russ Cox <r...@golang.org>
Reviewed-by: Russ Cox <r...@golang.org>
---
M src/crypto/sha256/sha256block_amd64.go
M src/crypto/sha256/sha256block_amd64.s
2 files changed, 152 insertions(+), 9 deletions(-)
diff --git a/src/crypto/sha256/sha256block_amd64.go b/src/crypto/sha256/sha256block_amd64.go
index 27464e2..b5d2c9b 100644
--- a/src/crypto/sha256/sha256block_amd64.go
+++ b/src/crypto/sha256/sha256block_amd64.go
@@ -7,3 +7,4 @@
import "internal/cpu"
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
+var useSHA = useAVX2 && cpu.X86.HasSHA
diff --git a/src/crypto/sha256/sha256block_amd64.s b/src/crypto/sha256/sha256block_amd64.s
index f6af47c..03535fb 100644
--- a/src/crypto/sha256/sha256block_amd64.s
+++ b/src/crypto/sha256/sha256block_amd64.s
@@ -179,7 +179,7 @@
#define XFER Y9
-#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
+#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13
#define NUM_BYTES DX
@@ -232,14 +232,14 @@
RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
- XORL g, y2; \ // y2 = f^g // CH
+ XORL g, y2; \ // y2 = f^g // CH
VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16] // y1 = (e >> 6) // S1
RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
; \
ANDL e, y2; \ // y2 = (f^g)&e // CH
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
- ADDL h, d; \ // d = k + w + h + d // --
+ ADDL h, d; \ // d = k + w + h + d // --
; \
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
@@ -270,7 +270,7 @@
MOVL a, y3; \ // y3 = a // MAJA
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
- ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
+ ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
@@ -316,7 +316,7 @@
; \
MOVL a, y3; \ // y3 = a // MAJA
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
- ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
+ ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
; \
VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
@@ -495,7 +495,7 @@
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
- ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
+ ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
@@ -531,7 +531,7 @@
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
- ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
+ ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
To view, visit change 408795. To unsubscribe, or for help writing mail filters, visit settings.