diff --git a/src/crypto/internal/fips140/sha512/sha512block_asm.go b/src/crypto/internal/fips140/sha512/sha512block_asm.go
index 5323451..351dde2 100644
--- a/src/crypto/internal/fips140/sha512/sha512block_asm.go
+++ b/src/crypto/internal/fips140/sha512/sha512block_asm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build (loong64 || riscv64) && !purego
+//go:build loong64 && !purego
package sha512
diff --git a/src/crypto/internal/fips140/sha512/sha512block_riscv64.go b/src/crypto/internal/fips140/sha512/sha512block_riscv64.go
new file mode 100644
index 0000000..6696538
--- /dev/null
+++ b/src/crypto/internal/fips140/sha512/sha512block_riscv64.go
@@ -0,0 +1,39 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !purego
+
+package sha512
+
+import (
+ "crypto/internal/fips140deps/cpu"
+ "crypto/internal/impl"
+)
+
+// supportZvknhb reports whether the vector SHA-512 implementation
+// (blockZvknhb) may be used: the CPU must report both Zvknhb and Zvkb, and
+// getVlen must return at least 256.
+//
+// According to the RISC-V manual, section 32.2.6, SHA-512 implementations
+// with VLEN < 256 require LMUL > 1 to combine 64-bit elements from register
+// groups to provide all four elements of the element group. blockZvknhb
+// configures LMUL = 1, hence the VLEN >= 256 requirement.
+//
+// getVlen is the last operand of the && chain, so it only runs once both
+// feature checks have passed.
+// NOTE(review): presumably reading the VLENB CSR is only safe when the vector
+// extension is present; keep the feature checks first.
+//
+// TODO(mzh): implement VLEN < 256.
+var supportZvknhb = cpu.RISCV64HasZvknhb && cpu.RISCV64HasZvkb && getVlen() >= 256
+
+func init() {
+ impl.Register("sha512", "ZVKNHB", &supportZvknhb)
+}
+
+// getVlen returns the value of the VLENB CSR shifted left by 3 (multiplied
+// by 8). It is implemented in sha512block_riscv64.s.
+// NOTE(review): VLENB is the vector register length in bytes per the RISC-V
+// "V" specification, which makes the result VLEN in bits.
+//
+//go:noescape
+func getVlen() uint64
+
+// blockZvknhb is the vector SHA-512 block function, implemented in
+// sha512block_riscv64.s with the VSHA2MS/VSHA2CL/VSHA2CH instructions. It
+// consumes p in 128-byte steps and writes the updated state back to dig.
+// p should be a non-empty multiple of 128 bytes. It must only be called when
+// supportZvknhb is true.
+//
+//go:noescape
+func blockZvknhb(dig *Digest, p []byte)
+
+// blockScalar is the assembly block function defined as ·blockScalar in
+// sha512block_riscv64.s (the routine previously exported there as block).
+// block falls back to it when supportZvknhb is false.
+//
+//go:noescape
+func blockScalar(dig *Digest, p []byte)
+
+// block updates dig with the data in p, choosing the implementation at run
+// time: blockZvknhb when supportZvknhb is set, blockScalar otherwise.
+func block(dig *Digest, p []byte) {
+	if !supportZvknhb {
+		// Vector path unavailable: use the scalar assembly routine.
+		blockScalar(dig, p)
+		return
+	}
+	blockZvknhb(dig, p)
+}
diff --git a/src/crypto/internal/fips140/sha512/sha512block_riscv64.s b/src/crypto/internal/fips140/sha512/sha512block_riscv64.s
index f25ed62..e150705 100644
--- a/src/crypto/internal/fips140/sha512/sha512block_riscv64.s
+++ b/src/crypto/internal/fips140/sha512/sha512block_riscv64.s
@@ -149,8 +149,8 @@
MSGSCHEDULE1(index); \
SHA512ROUND(index, a, b, c, d, e, f, g, h)
-// func block(dig *Digest, p []byte)
-TEXT ·block(SB),0,$128-32
+// func blockScalar(dig *Digest, p []byte)
+TEXT ·blockScalar(SB),0,$128-32
MOV p_base+8(FP), X29
MOV p_len+16(FP), X30
SRL $7, X30
@@ -285,3 +285,148 @@
end:
RET
+
+// func getVlen() uint64
+//
+// getVlen reads the VLENB CSR and returns it shifted left by 3.
+// NOTE(review): VLENB is the vector register length in bytes per the RISC-V
+// "V" specification, so the result is VLEN in bits.
+TEXT ·getVlen(SB),0,$0-8
+ CSRRS X0, VLENB, X10 // X10 = VLENB; the source register is X0, so no CSR bits are set
+ SLLI $3, X10, X10 // X10 *= 8
+ MOV X10, ret+0(FP)
+ RET
+
+// ZVKNBRound consumes one group of four message words and schedules the
+// next group. Each Zvknhb round needs the message schedule plus the
+// constants.
+//
+// Arguments (vector registers):
+//   K    = K[t:t+3]
+//   WP0  = W[t:t+3], overwritten with W[t+16:t+19]
+//   WP4  = W[t+4:t+7]
+//   WP8  = W[t+8:t+11]
+//   WP12 = W[t+12:t+15]
+//
+// The working state lives in V2 and V3. V6 and V7 are clobbered as scratch.
+//
+// State update:
+// VADDVV K[t:t+3], W[t:t+3], V6 // V6 = W[0] + K[0] ... W[3] + K[3]
+// VSHA2CLVV V6, V2, V3
+// VSHA2CHVV V6, V3, V2 // Update states
+//
+// Message schedule, reusing the W[t:t+3] register for the next words.
+// V0 must be 0b0001; see the notes in the RISC-V manual, section 32.3.22.
+// VMERGEVVM W[t+4:t+7], W[t+8:t+11], V0, V7
+// VSHA2MSVV W[t+12:t+15], V7, Wnext // Now Wnext = W[t+16:t+19]
+#define ZVKNBRound(K, WP0, WP4, WP8, WP12) \
+ VADDVV K, WP0, V6; \
+ VSHA2CLVV V6, V2, V3; \
+ VSHA2CHVV V6, V3, V2; \
+ VMERGEVVM WP4, WP8, V0, V7; \
+ VSHA2MSVV WP12, V7, WP0
+
+// ZVKNBLastRound is ZVKNBRound without the message-schedule step: it adds K
+// to WP0 into V6 and updates the state in V2/V3, leaving WP0 untouched.
+// blockZvknhb uses it for the last four groups of a block, where no further
+// message words are needed. V6 is clobbered.
+#define ZVKNBLastRound(K, WP0) \
+ VADDVV K, WP0, V6; \
+ VSHA2CLVV V6, V2, V3; \
+ VSHA2CHVV V6, V3, V2
+
+
+// func blockZvknhb(dig *Digest, p []byte)
+//
+// blockZvknhb updates the SHA-512 state in dig with every whole 128-byte
+// block of p, using the Zvknhb/Zvkb vector instructions. The length is first
+// reduced to a count of whole blocks (the same SRL $7 that blockScalar
+// applies), so trailing bytes that do not fill a block are ignored and an
+// input shorter than one block returns without touching dig.
+//
+// Register use:
+//   X10 = dig, X15 = dig+16, X11 = input cursor, X12 = blocks remaining
+//   X14 = scratch address (state_mask, then the _K table)
+//   V0 = merge mask for the message schedule
+//   V1 = byte offsets for the indexed state load/store (state_mask)
+//   V2, V3 = working state
+//   V4, V5 = copy of the state taken at the start of each block
+//   V6, V7 = scratch used by ZVKNBRound/ZVKNBLastRound
+//   V8-V27 = round constants, four 64-bit words per register
+//   V28-V31 = current 16-word message schedule window
+TEXT ·blockZvknhb(SB),0,$0-32
+ MOV dig+0(FP), X10
+ MOV p_base+8(FP), X11
+ MOV p_len+16(FP), X12
+ SRL $7, X12 // X12 = number of whole 128-byte blocks
+ BEQZ X12, end // nothing to hash: do not enter the loop
+
+ VSETIVLI $4, E64, M1, TA, MA, ZERO // four 64-bit elements in one register: requires VLEN >= 256
+
+ VMVVI $1, V0 // merge mask 0b0001, see the notes in the RISC-V manual, 32.3.22
+
+ MOV $state_mask<>+0(SB), X14
+ VLE64V (X14), V1 // V1 = byte offsets {40, 32, 8, 0}
+ ADD $16, X10, X15
+ VLUXEI64V (X10), V1, V2 // V2 = F, E, B, A
+ VLUXEI64V (X15), V1, V3 // V3 = H, G, D, C
+
+ // Load the round constants once; they stay resident in V8-V27 for all
+ // blocks (20 registers x 4 words, 32 bytes per load).
+ MOV $·_K(SB), X14
+ VLE64V (X14), V8
+ ADD $32, X14
+ VLE64V (X14), V9
+ ADD $32, X14
+ VLE64V (X14), V10
+ ADD $32, X14
+ VLE64V (X14), V11
+ ADD $32, X14
+ VLE64V (X14), V12
+ ADD $32, X14
+ VLE64V (X14), V13
+ ADD $32, X14
+ VLE64V (X14), V14
+ ADD $32, X14
+ VLE64V (X14), V15
+ ADD $32, X14
+ VLE64V (X14), V16
+ ADD $32, X14
+ VLE64V (X14), V17
+ ADD $32, X14
+ VLE64V (X14), V18
+ ADD $32, X14
+ VLE64V (X14), V19
+ ADD $32, X14
+ VLE64V (X14), V20
+ ADD $32, X14
+ VLE64V (X14), V21
+ ADD $32, X14
+ VLE64V (X14), V22
+ ADD $32, X14
+ VLE64V (X14), V23
+ ADD $32, X14
+ VLE64V (X14), V24
+ ADD $32, X14
+ VLE64V (X14), V25
+ ADD $32, X14
+ VLE64V (X14), V26
+ ADD $32, X14
+ VLE64V (X14), V27
+
+loop:
+ VMV2RV V2, V4 // V4, V5 = V2, V3, add later
+
+ // Load the 128-byte block as sixteen 64-bit words into V28-V31 and
+ // byte-reverse each word (the input words are big-endian).
+ VLE64V (X11), V28
+ VREV8V V28, V28
+ ADD $32, X11
+ VLE64V (X11), V29
+ VREV8V V29, V29
+ ADD $32, X11
+ VLE64V (X11), V30
+ VREV8V V30, V30
+ ADD $32, X11
+ VLE64V (X11), V31
+ VREV8V V31, V31
+ ADD $32, X11
+
+ // Sixteen groups that also schedule the following message words; the
+ // window registers rotate by one position per group.
+ ZVKNBRound( V8, V28, V29, V30, V31)
+ ZVKNBRound( V9, V29, V30, V31, V28)
+ ZVKNBRound(V10, V30, V31, V28, V29)
+ ZVKNBRound(V11, V31, V28, V29, V30)
+
+ ZVKNBRound(V12, V28, V29, V30, V31)
+ ZVKNBRound(V13, V29, V30, V31, V28)
+ ZVKNBRound(V14, V30, V31, V28, V29)
+ ZVKNBRound(V15, V31, V28, V29, V30)
+
+ ZVKNBRound(V16, V28, V29, V30, V31)
+ ZVKNBRound(V17, V29, V30, V31, V28)
+ ZVKNBRound(V18, V30, V31, V28, V29)
+ ZVKNBRound(V19, V31, V28, V29, V30)
+
+ ZVKNBRound(V20, V28, V29, V30, V31)
+ ZVKNBRound(V21, V29, V30, V31, V28)
+ ZVKNBRound(V22, V30, V31, V28, V29)
+ ZVKNBRound(V23, V31, V28, V29, V30)
+
+ // Final four groups: no further message words to schedule.
+ ZVKNBLastRound(V24, V28)
+ ZVKNBLastRound(V25, V29)
+ ZVKNBLastRound(V26, V30)
+ ZVKNBLastRound(V27, V31)
+
+ // Add the state saved at the start of the block.
+ VADDVV V4, V2, V2
+ VADDVV V5, V3, V3
+
+ SUB $1, X12
+ BNEZ X12, loop
+
+ // Scatter the final state back into dig using the same offsets.
+ VSUXEI64V V2, V1, (X10)
+ VSUXEI64V V3, V1, (X15)
+
+end:
+ RET
+
+// state_mask holds the four byte offsets used by blockZvknhb's indexed vector
+// loads and stores of the state. Relative to dig they select the 64-bit words
+// at indices 5, 4, 1, 0; relative to dig+16 they select indices 7, 6, 3, 2.
+// NOTE(review): this assumes the eight state words sit at the start of
+// Digest; confirm against the Digest definition.
+DATA state_mask<>+0(SB)/8, $40
+DATA state_mask<>+8(SB)/8, $32
+DATA state_mask<>+16(SB)/8, $8
+DATA state_mask<>+24(SB)/8, $0
+GLOBL state_mask<>(SB), RODATA, $32
diff --git a/src/crypto/internal/fips140deps/cpu/cpu.go b/src/crypto/internal/fips140deps/cpu/cpu.go
index 9ac7e51..88aafb4 100644
--- a/src/crypto/internal/fips140deps/cpu/cpu.go
+++ b/src/crypto/internal/fips140deps/cpu/cpu.go
@@ -29,6 +29,7 @@
RISCV64HasV = cpu.RISCV64.HasV
RISCV64HasZvknha = cpu.RISCV64.HasZvknha
+ RISCV64HasZvknhb = cpu.RISCV64.HasZvknhb
RISCV64HasZvkb = cpu.RISCV64.HasZvkb
S390XHasAES = cpu.S390X.HasAES