[go] encoding/hex: add SIMD encoding and decoding

0 views
Skip to first unread message

Илья (Gerrit)

unread,
Jun 30, 2026, 4:15:07 PM (8 hours ago) Jun 30
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Илья has uploaded the change for review

Commit message

encoding/hex: add SIMD encoding and decoding
Change-Id: Ic21fa6f0f864d825cfcf45e3d5c7a3bcbdc6e77f

Change diff

diff --git a/src/encoding/hex/hex.go b/src/encoding/hex/hex.go
index 4499974..838758b 100644
--- a/src/encoding/hex/hex.go
+++ b/src/encoding/hex/hex.go
@@ -43,8 +43,9 @@
// of bytes written to dst, but this value is always [EncodedLen](len(src)).
// Encode implements hexadecimal encoding.
func Encode(dst, src []byte) int {
- j := 0
- for _, v := range src {
+ processed := encodeSIMD(dst, src)
+ j := processed * 2
+ for _, v := range src[processed:] {
dst[j] = hextable[v>>4]
dst[j+1] = hextable[v&0x0f]
j += 2
@@ -85,7 +86,8 @@
// If the input is malformed, Decode returns the number
// of bytes decoded before the error.
func Decode(dst, src []byte) (int, error) {
- i, j := 0, 0
+ processed := decodeSIMD(dst, src)
+ i, j := processed/2, processed
for ; j < len(src)-1; j += 2 {
p := src[j]
q := src[j+1]
diff --git a/src/encoding/hex/hex_fallback.go b/src/encoding/hex/hex_fallback.go
new file mode 100644
index 0000000..52e5371
--- /dev/null
+++ b/src/encoding/hex/hex_fallback.go
@@ -0,0 +1,15 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !goexperiment.simd || (!arm64 && !amd64 && !wasm)
+
+package hex
+
+func encodeSIMD(dst, src []byte) int {
+ return 0
+}
+
+func decodeSIMD(dst, src []byte) int {
+ return 0
+}
diff --git a/src/encoding/hex/hex_simd_amd64.go b/src/encoding/hex/hex_simd_amd64.go
new file mode 100644
index 0000000..8b4f95c
--- /dev/null
+++ b/src/encoding/hex/hex_simd_amd64.go
@@ -0,0 +1,320 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package hex
+
+import (
+ "internal/byteorder"
+ "simd/archsimd"
+)
+
+const (
+ simdBlockSize = 16
+ avx2BlockSize = 32
+)
+
+var (
+ hexTable128 = [simdBlockSize]byte{
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+ }
+ // AVX2 byte permutations operate independently on each 128-bit lane.
+ hexTable256 = [avx2BlockSize]byte{
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+ }
+)
+
+// Loading constants from memory keeps the 128-bit path compatible with AVX-only
+// CPUs; the corresponding archsimd broadcast operations require AVX2.
+var (
+ lowNibbleMask128 = [simdBlockSize]byte{
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ }
+ lowNibbleMask256 = [avx2BlockSize]byte{
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ }
+ lowByteMask128 = [simdBlockSize / 2]uint16{
+ 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+ }
+ lowByteMask256 = [avx2BlockSize / 2]uint16{
+ 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+ 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+ }
+ decodePackMask128 = [simdBlockSize]int8{
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ }
+ decodePackMask256 = [avx2BlockSize]int8{
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ }
+ decodePairWeights128 = [simdBlockSize]int8{
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ }
+ decodePairWeights256 = [avx2BlockSize]int8{
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ }
+ letterNibbleOffset128 = [simdBlockSize]byte{
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ }
+ letterNibbleOffset256 = [avx2BlockSize]byte{
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ }
+ digitRangeBefore128 = [simdBlockSize]int8{
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ }
+ digitRangeBefore256 = [avx2BlockSize]int8{
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ }
+ digitRangeAfter128 = [simdBlockSize]int8{
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ }
+ digitRangeAfter256 = [avx2BlockSize]int8{
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ }
+ lowerLetterRangeBefore128 = [simdBlockSize]int8{
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ }
+ lowerLetterRangeBefore256 = [avx2BlockSize]int8{
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ }
+ lowerLetterRangeAfter128 = [simdBlockSize]int8{
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ }
+ lowerLetterRangeAfter256 = [avx2BlockSize]int8{
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ }
+ normalizeLetterValue128 = [simdBlockSize]int8{
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ }
+ normalizeLetterValue256 = [avx2BlockSize]int8{
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ }
+)
+
+// encodeSIMD encodes complete SIMD blocks and returns the number of source
+// bytes consumed.
+func encodeSIMD(dst, src []byte) int {
+ if len(src) < simdBlockSize || !archsimd.X86.AVX() {
+ return 0
+ }
+
+ processed := 0
+ useAVX2 := archsimd.X86.AVX2() && len(src) >= avx2BlockSize
+ if useAVX2 {
+ for len(src) >= avx2BlockSize {
+ encodedLo, encodedHi := encodeBlockAVX2(archsimd.LoadUint8x32(src))
+ encodedLo.Store(dst[:avx2BlockSize])
+ encodedHi.Store(dst[avx2BlockSize : 2*avx2BlockSize])
+
+ src = src[avx2BlockSize:]
+ dst = dst[2*avx2BlockSize:]
+ processed += avx2BlockSize
+ }
+ }
+
+ for len(src) >= simdBlockSize {
+ encodedLo, encodedHi := encodeBlock(archsimd.LoadUint8x16(src))
+ encodedLo.Store(dst[:simdBlockSize])
+ encodedHi.Store(dst[simdBlockSize : 2*simdBlockSize])
+
+ src = src[simdBlockSize:]
+ dst = dst[2*simdBlockSize:]
+ processed += simdBlockSize
+ }
+
+ if useAVX2 {
+ archsimd.ClearAVXUpperBits()
+ }
+ return processed
+}
+
+// encodeBlock encodes one 128-bit vector into two consecutive vectors of
+// hexadecimal digits using AVX instructions.
+func encodeBlock(x archsimd.Uint8x16) (encodedLo, encodedHi archsimd.Uint8x16) {
+ nibbleMask := archsimd.LoadUint8x16Array(&lowNibbleMask128)
+ lowNibbles := x.And(nibbleMask)
+ highNibbles := x.AsUint16x8().ShiftAllRight(4).AsUint8x16().And(nibbleMask)
+
+ hexTable := archsimd.LoadUint8x16Array(&hexTable128)
+ lowDigits := hexTable.PermuteOrZero(lowNibbles.AsInt8x16())
+ highDigits := hexTable.PermuteOrZero(highNibbles.AsInt8x16())
+
+ lowByteMask := archsimd.LoadUint16x8Array(&lowByteMask128)
+ highWords := highDigits.AsUint16x8()
+ lowWords := lowDigits.AsUint16x8()
+ evenPairs := highWords.And(lowByteMask).Or(lowWords.ShiftAllLeft(8))
+ oddPairs := highWords.ShiftAllRight(8).Or(lowWords.And(lowByteMask.ShiftAllLeft(8)))
+
+ return evenPairs.InterleaveLo(oddPairs).AsUint8x16(), evenPairs.InterleaveHi(oddPairs).AsUint8x16()
+}
+
+// encodeBlockAVX2 encodes one 256-bit vector into two consecutive vectors of
+// hexadecimal digits.
+func encodeBlockAVX2(x archsimd.Uint8x32) (encodedLo, encodedHi archsimd.Uint8x32) {
+ nibbleMask := archsimd.LoadUint8x32Array(&lowNibbleMask256)
+ lowNibbles := x.And(nibbleMask)
+ highNibbles := x.AsUint16x16().ShiftAllRight(4).AsUint8x32().And(nibbleMask)
+
+ hexTable := archsimd.LoadUint8x32Array(&hexTable256)
+ lowDigits := hexTable.PermuteOrZeroGrouped(lowNibbles.AsInt8x32())
+ highDigits := hexTable.PermuteOrZeroGrouped(highNibbles.AsInt8x32())
+
+ lowByteMask := archsimd.LoadUint16x16Array(&lowByteMask256)
+ highWords := highDigits.AsUint16x16()
+ lowWords := lowDigits.AsUint16x16()
+ evenPairs := highWords.And(lowByteMask).Or(lowWords.ShiftAllLeft(8))
+ oddPairs := highWords.ShiftAllRight(8).Or(lowWords.And(lowByteMask.ShiftAllLeft(8)))
+
+ interleavedLo := evenPairs.InterleaveLoGrouped(oddPairs)
+ interleavedHi := evenPairs.InterleaveHiGrouped(oddPairs)
+ encodedLo = interleavedLo.ConcatPermute128Scalars(0, 2, interleavedHi).AsUint8x32()
+ encodedHi = interleavedLo.ConcatPermute128Scalars(1, 3, interleavedHi).AsUint8x32()
+ return encodedLo, encodedHi
+}
+
+// decodeSIMD decodes complete SIMD blocks, stopping before the first block that
+// contains an invalid byte, and returns the number of source bytes consumed.
+func decodeSIMD(dst, src []byte) (processed int) {
+ if len(src) < simdBlockSize || !archsimd.X86.AVX() {
+ return 0
+ }
+
+ useAVX2 := archsimd.X86.AVX2() && len(src) >= avx2BlockSize
+ if useAVX2 {
+ for len(src) >= avx2BlockSize {
+ if invalid := decodeBlockAVX2(archsimd.LoadUint8x32(src), dst); invalid {
+ archsimd.ClearAVXUpperBits()
+ return processed
+ }
+ src = src[avx2BlockSize:]
+ dst = dst[avx2BlockSize/2:]
+ processed += avx2BlockSize
+ }
+ }
+
+ for len(src) >= simdBlockSize {
+ if invalid := decodeBlock(archsimd.LoadUint8x16(src), dst); invalid {
+ if useAVX2 {
+ archsimd.ClearAVXUpperBits()
+ }
+ return processed
+ }
+ src = src[simdBlockSize:]
+ dst = dst[simdBlockSize/2:]
+ processed += simdBlockSize
+ }
+
+ if useAVX2 {
+ archsimd.ClearAVXUpperBits()
+ }
+ return processed
+}
+
+// decodeBlock decodes one 128-bit vector of hexadecimal digits into eight bytes.
+// It reports whether the input contained an invalid byte; if so, dst is not written.
+func decodeBlock(input archsimd.Uint8x16, dst []byte) (invalid bool) {
+ c := input.BitsToInt8()
+ digitRangeBefore := archsimd.LoadInt8x16Array(&digitRangeBefore128)
+ digitRangeAfter := archsimd.LoadInt8x16Array(&digitRangeAfter128)
+ lowerLetterRangeBefore := archsimd.LoadInt8x16Array(&lowerLetterRangeBefore128)
+ lowerLetterRangeAfter := archsimd.LoadInt8x16Array(&lowerLetterRangeAfter128)
+ lower := input.Or(archsimd.LoadInt8x16Array(&normalizeLetterValue128).AsUint8x16())
+ lowerC := lower.BitsToInt8()
+
+ // Use strict bounds because signed Greater maps directly to VPCMPGTB,
+ // while inclusive comparisons require multiple instructions.
+ isDigit := c.Greater(digitRangeBefore).
+ And(digitRangeAfter.Greater(c))
+ isLetter := lowerC.Greater(lowerLetterRangeBefore).
+ And(lowerLetterRangeAfter.Greater(lowerC))
+ isValid := isDigit.Or(isLetter)
+
+ if isValid.ToBits() != 0xffff {
+ return true
+ }
+
+ nibble := input.And(archsimd.LoadUint8x16Array(&lowNibbleMask128)).
+ Add(isLetter.ToInt8x16().ToBits().And(archsimd.LoadUint8x16Array(&letterNibbleOffset128)))
+
+ decodeNibbles(nibble, dst)
+ return false
+}
+
+// decodeBlockAVX2 decodes one 256-bit vector of hexadecimal digits into
+// sixteen bytes using AVX2 instructions.
+// It reports whether the input contained an invalid byte; if so, dst is not written.
+func decodeBlockAVX2(input archsimd.Uint8x32, dst []byte) (invalid bool) {
+ c := input.BitsToInt8()
+ lower := input.Or(archsimd.LoadInt8x32Array(&normalizeLetterValue256).AsUint8x32())
+ lowerC := lower.BitsToInt8()
+ // See the 128-bit path for why these use strict bounds.
+ isDigit := c.Greater(archsimd.LoadInt8x32Array(&digitRangeBefore256)).
+ And(archsimd.LoadInt8x32Array(&digitRangeAfter256).Greater(c))
+ isLetter := lowerC.Greater(archsimd.LoadInt8x32Array(&lowerLetterRangeBefore256)).
+ And(archsimd.LoadInt8x32Array(&lowerLetterRangeAfter256).Greater(lowerC))
+ isValid := isDigit.Or(isLetter)
+
+ if isValid.ToBits() != 0xffffffff {
+ return true
+ }
+
+ nibble := input.And(archsimd.LoadUint8x32Array(&lowNibbleMask256)).
+ Add(isLetter.ToInt8x32().ToBits().And(archsimd.LoadUint8x32Array(&letterNibbleOffset256)))
+
+ packed := nibble.DotProductPairsSaturated(archsimd.LoadInt8x32Array(&decodePairWeights256)).
+ ToBits().AsUint8x32().
+ PermuteOrZeroGrouped(archsimd.LoadInt8x32Array(&decodePackMask256))
+ decoded := packed.GetLo().AsUint64x2().
+ ConcatPermuteScalars(0, 2, packed.GetHi().AsUint64x2()).AsUint8x16()
+ decoded.Store(dst[:16])
+ return false
+}
+
+// decodeNibbles combines 16 nibble values into 8 decoded bytes.
+func decodeNibbles(nibble archsimd.Uint8x16, dst []byte) {
+ packed := nibble.DotProductPairsSaturated(archsimd.LoadInt8x16Array(&decodePairWeights128)).
+ ToBits().AsUint8x16().
+ PermuteOrZero(archsimd.LoadInt8x16Array(&decodePackMask128))
+ byteorder.LEPutUint64(dst, packed.AsUint64x2().GetElem(0))
+}
diff --git a/src/encoding/hex/hex_simd_arm64.go b/src/encoding/hex/hex_simd_arm64.go
new file mode 100644
index 0000000..a7d8d93
--- /dev/null
+++ b/src/encoding/hex/hex_simd_arm64.go
@@ -0,0 +1,89 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && arm64
+
+package hex
+
+import (
+ "internal/byteorder"
+ "simd/archsimd"
+)
+
+const simdBlockSize = 16
+
+var hexTableVector = archsimd.LoadUint8x16([]byte(hextable))
+
+const lowNibbleMask = 0x0f
+const lowLetter = 0x20 // 'A'...'F' -> 'a'...'f'.
+const digitRangeStart = byte('0')
+const digitRangeSize = byte('9') - digitRangeStart + 1
+const letterRangeStart = byte('a')
+const letterRangeSize = byte('f') - letterRangeStart + 1
+const letterNibbleOffset = 9
+
+// encodeSIMD encodes complete SIMD blocks and returns the number of source
+// bytes consumed.
+func encodeSIMD(dst, src []byte) (processed int) {
+ for len(src) >= simdBlockSize {
+ encodedLo, encodedHi := encodeBlock(archsimd.LoadUint8x16(src))
+ encodedLo.Store(dst[:simdBlockSize])
+ encodedHi.Store(dst[simdBlockSize : 2*simdBlockSize])
+
+ src = src[simdBlockSize:]
+ dst = dst[2*simdBlockSize:]
+ processed += simdBlockSize
+ }
+ return processed
+}
+
+// encodeBlock encodes one vector into two consecutive vectors of hexadecimal
+// digits.
+func encodeBlock(x archsimd.Uint8x16) (encodedLo, encodedHi archsimd.Uint8x16) {
+ lowNibbles := x.And(archsimd.BroadcastUint8x16(lowNibbleMask))
+ highNibbles := x.ShiftAllRight(4)
+
+ lowDigits := hexTableVector.LookupOrZero(lowNibbles)
+ highDigits := hexTableVector.LookupOrZero(highNibbles)
+
+ return highDigits.InterleaveLo(lowDigits), highDigits.InterleaveHi(lowDigits)
+}
+
+// decodeSIMD decodes complete SIMD blocks, stopping before the first block that
+// contains an invalid byte, and returns the number of source bytes consumed.
+func decodeSIMD(dst, src []byte) (processed int) {
+ for len(src) >= simdBlockSize {
+ if invalid := decodeBlock(archsimd.LoadUint8x16(src), dst); invalid {
+ return processed
+ }
+ src = src[simdBlockSize:]
+ dst = dst[simdBlockSize/2:]
+ processed += simdBlockSize
+ }
+ return processed
+}
+
+// decodeBlock decodes one vector of hexadecimal digits into eight bytes.
+// It reports whether the input contained an invalid byte; if so, dst is not written.
+func decodeBlock(input archsimd.Uint8x16, dst []byte) (invalid bool) {
+ lower := input.Or(archsimd.BroadcastUint8x16(lowLetter)) // 'A'...'F' -> 'a'...'f'.
+ isDigit := input.Sub(archsimd.BroadcastUint8x16(digitRangeStart)).
+ Less(archsimd.BroadcastUint8x16(digitRangeSize))
+ isLetter := lower.Sub(archsimd.BroadcastUint8x16(letterRangeStart)).
+ Less(archsimd.BroadcastUint8x16(letterRangeSize))
+ isValid := isDigit.Or(isLetter)
+
+ if isValid.ToInt8x16().ToBits().ReduceMin() == 0 {
+ return true
+ }
+
+ nibble := input.And(archsimd.BroadcastUint8x16(lowNibbleMask)).
+ Add(isLetter.ToInt8x16().ToBits().And(archsimd.BroadcastUint8x16(letterNibbleOffset))).BitsToInt8()
+
+ evenNibbles := nibble.ConcatEven(nibble)
+ oddNibbles := nibble.ConcatOdd(nibble)
+ result := evenNibbles.ShiftAllLeft(4).Or(oddNibbles).ToBits().ReshapeToUint64s()
+ byteorder.LEPutUint64(dst, result.GetElem(0))
+ return false
+}
diff --git a/src/encoding/hex/hex_simd_test.go b/src/encoding/hex/hex_simd_test.go
new file mode 100644
index 0000000..d67af3f
--- /dev/null
+++ b/src/encoding/hex/hex_simd_test.go
@@ -0,0 +1,204 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && (arm64 || amd64 || wasm)
+
+package hex
+
+import (
+ "bytes"
+ "strconv"
+ "testing"
+)
+
+func TestEncodeSIMDBoundaries(t *testing.T) {
+ src := make([]byte, 257)
+ for i := range src {
+ src[i] = byte(i)
+ }
+
+ sizes := []int{
+ 0, 1,
+ 15, 16, 17,
+ 31, 32, 33,
+ 47, 48, 49,
+ 63, 64, 65,
+ 127, 128, 129,
+ 255, 256, 257,
+ }
+
+ const guardSize = 16
+ wantGuard := bytes.Repeat([]byte{0xa5}, guardSize)
+ for _, size := range sizes {
+ t.Run(strconv.Itoa(size), func(t *testing.T) {
+ encodedLen := EncodedLen(size)
+ backing := bytes.Repeat([]byte{0xa5}, encodedLen+2*guardSize)
+ dst := backing[guardSize : guardSize+encodedLen]
+
+ if got := Encode(dst, src[:size]); got != encodedLen {
+ t.Fatalf("Encode returned %d, want %d", got, encodedLen)
+ }
+ if want := encodeScalarForTest(src[:size]); !bytes.Equal(dst, want) {
+ t.Fatalf("Encode returned %q, want %q", dst, want)
+ }
+ if !bytes.Equal(backing[:guardSize], wantGuard) {
+ t.Fatal("Encode wrote before dst")
+ }
+ if !bytes.Equal(backing[guardSize+encodedLen:], wantGuard) {
+ t.Fatal("Encode wrote past dst")
+ }
+ })
+ }
+}
+
+func TestDecodeSIMDBoundaries(t *testing.T) {
+ sizes := []int{
+ 0, 2,
+ 14, 16, 18,
+ 30, 32, 34,
+ 46, 48, 50,
+ 62, 64, 66,
+ 126, 128, 130,
+ 254, 256, 258,
+ }
+
+ const guardSize = 16
+ wantGuard := bytes.Repeat([]byte{0xa5}, guardSize)
+ for _, size := range sizes {
+ t.Run(strconv.Itoa(size), func(t *testing.T) {
+ const hexChars = "0123456789abcdefABCDEF"
+ src := make([]byte, size)
+ for i := range src {
+ src[i] = hexChars[i%len(hexChars)]
+ }
+ decodedLen := DecodedLen(size)
+
+ backing := bytes.Repeat([]byte{0xa5}, decodedLen+2*guardSize)
+ dst := backing[guardSize : guardSize+decodedLen]
+
+ n, err := Decode(dst, src)
+ if err != nil {
+ t.Fatalf("Decode returned error: %v", err)
+ }
+ if n != decodedLen {
+ t.Fatalf("Decode returned %d, want %d", n, decodedLen)
+ }
+ if want := decodeScalarForTest(src); !bytes.Equal(dst, want) {
+ t.Fatalf("Decode returned %q, want %q", dst, want)
+ }
+ if !bytes.Equal(backing[:guardSize], wantGuard) {
+ t.Fatal("Decode wrote before dst")
+ }
+ if !bytes.Equal(backing[guardSize+decodedLen:], wantGuard) {
+ t.Fatal("Decode wrote past dst")
+ }
+ })
+ }
+}
+
+func TestDecodeSIMDOddLength(t *testing.T) {
+ for _, size := range []int{1, 15, 17, 31, 33, 47, 49} {
+ t.Run(strconv.Itoa(size), func(t *testing.T) {
+ src := bytes.Repeat([]byte{'0'}, size)
+ dst := bytes.Repeat([]byte{0xa5}, DecodedLen(size))
+
+ n, err := Decode(dst, src)
+ if err != ErrLength {
+ t.Fatalf("Decode returned error %v, want ErrLength", err)
+ }
+ if n != DecodedLen(size) {
+ t.Fatalf("Decode returned %d, want %d", n, DecodedLen(size))
+ }
+ if want := make([]byte, n); !bytes.Equal(dst, want) {
+ t.Fatalf("Decode returned %q, want %q", dst, want)
+ }
+ })
+ }
+}
+
+func TestDecodeSIMDErrors(t *testing.T) {
+ tests := []struct {
+ size int
+ pos int
+ }{
+ {32, 0},
+ {32, 1},
+ {32, 2},
+ {32, 14},
+ {32, 15},
+ {32, 16},
+ {32, 17},
+ {32, 18},
+ {32, 30},
+ {32, 31},
+ {33, 32},
+ }
+ for _, test := range tests {
+ name := strconv.Itoa(test.pos) + "/" + strconv.Itoa(test.size)
+ t.Run(name, func(t *testing.T) {
+ src := bytes.Repeat([]byte{'0'}, test.size)
+ src[test.pos] = 'z'
+ dst := bytes.Repeat([]byte{0xa5}, DecodedLen(test.size))
+
+ n, err := Decode(dst, src)
+ if err != InvalidByteError('z') {
+ t.Fatalf("Decode returned error %v, want InvalidByteError(%q)", err, 'z')
+ }
+ wantN := test.pos / 2
+ if n != wantN {
+ t.Fatalf("Decode returned %d, want %d", n, wantN)
+ }
+ if want := make([]byte, n); !bytes.Equal(dst[:n], want) {
+ t.Fatalf("Decode returned %q, want %q", dst[:n], want)
+ }
+ if want := bytes.Repeat([]byte{0xa5}, len(dst)-n); !bytes.Equal(dst[n:], want) {
+ t.Fatalf("Decode wrote past the valid prefix: got %x, want %x", dst[n:], want)
+ }
+ })
+ }
+}
+
+func TestDecodeSIMDAllBytes(t *testing.T) {
+ for _, pos := range []int{0, 15, 16, 31} {
+ for value := 0; value < 256; value++ {
+ c := byte(value)
+ src := bytes.Repeat([]byte{'0'}, 32)
+ src[pos] = c
+ dst := bytes.Repeat([]byte{0xa5}, DecodedLen(len(src)))
+
+ n, err := Decode(dst, src)
+ valid := '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F'
+ if valid {
+ if err != nil || n != len(dst) {
+ t.Fatalf("Decode with byte %#02x at %d returned %d, %v", c, pos, n, err)
+ }
+ if want := decodeScalarForTest(src); !bytes.Equal(dst, want) {
+ t.Fatalf("Decode with byte %#02x at %d returned %x, want %x", c, pos, dst, want)
+ }
+ continue
+ }
+
+ if err != InvalidByteError(c) || n != pos/2 {
+ t.Fatalf("Decode with byte %#02x at %d returned %d, %v; want %d, InvalidByteError", c, pos, n, err, pos/2)
+ }
+ }
+ }
+}
+
+func encodeScalarForTest(src []byte) []byte {
+ dst := make([]byte, EncodedLen(len(src)))
+ for i, b := range src {
+ dst[2*i] = hextable[b>>4]
+ dst[2*i+1] = hextable[b&0x0f]
+ }
+ return dst
+}
+
+func decodeScalarForTest(src []byte) []byte {
+ dst := make([]byte, DecodedLen(len(src)))
+ for i := 0; i < len(src)-1; i += 2 {
+ dst[i/2] = (reverseHexTable[src[i]] << 4) | reverseHexTable[src[i+1]]
+ }
+ return dst
+}
diff --git a/src/encoding/hex/hex_simd_wasm.go b/src/encoding/hex/hex_simd_wasm.go
new file mode 100644
index 0000000..7e76f71
--- /dev/null
+++ b/src/encoding/hex/hex_simd_wasm.go
@@ -0,0 +1,102 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && wasm
+
+package hex
+
+import (
+ "internal/byteorder"
+ "simd/archsimd"
+)
+
+const simdBlockSize = 16
+
+var hexTableVector = archsimd.LoadUint8x16([]byte(hextable)).BitsToInt8()
+var decodePackMask = archsimd.LoadInt8x16([]int8{
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+})
+
+const lowNibbleMask = 0x0f
+const lowByteMask = 0x00ff
+const lowLetter = 0x20 // 'A'...'F' -> 'a'...'f'.
+const digitRangeStart = byte('0')
+const digitRangeSize = byte('9') - digitRangeStart + 1
+const letterRangeStart = byte('a')
+const letterRangeSize = byte('f') - letterRangeStart + 1
+const letterNibbleOffset = 9
+
+// encodeSIMD encodes complete SIMD blocks and returns the number of source
+// bytes consumed.
+func encodeSIMD(dst, src []byte) (processed int) {
+ for len(src) >= simdBlockSize {
+ encodedLo, encodedHi := encodeBlock(archsimd.LoadUint8x16(src))
+ encodedLo.Store(dst[:simdBlockSize])
+ encodedHi.Store(dst[simdBlockSize : 2*simdBlockSize])
+
+ src = src[simdBlockSize:]
+ dst = dst[2*simdBlockSize:]
+ processed += simdBlockSize
+ }
+ return processed
+}
+
+// encodeBlock encodes one vector into two consecutive vectors of hexadecimal
+// digits.
+func encodeBlock(x archsimd.Uint8x16) (encodedLo, encodedHi archsimd.Uint8x16) {
+ lowNibbles := x.And(archsimd.BroadcastUint8x16(lowNibbleMask))
+ highNibbles := x.ShiftAllRight(4)
+
+ lowDigits := hexTableVector.LookupOrZero(lowNibbles.BitsToInt8()).ToBits()
+ highDigits := hexTableVector.LookupOrZero(highNibbles.BitsToInt8()).ToBits()
+
+ encodedLo = highDigits.ExtendLo8ToUint16().
+ Or(lowDigits.ExtendLo8ToUint16().ShiftAllLeft(8)).
+ ReshapeToUint8s()
+ encodedHi = highDigits.ExtendHi8ToUint16().
+ Or(lowDigits.ExtendHi8ToUint16().ShiftAllLeft(8)).
+ ReshapeToUint8s()
+ return encodedLo, encodedHi
+}
+
+// decodeSIMD decodes complete SIMD blocks, stopping before the first block that
+// contains an invalid byte, and returns the number of source bytes consumed.
+func decodeSIMD(dst, src []byte) (processed int) {
+ for len(src) >= simdBlockSize {
+ if invalid := decodeBlock(archsimd.LoadUint8x16(src), dst); invalid {
+ return processed
+ }
+ src = src[simdBlockSize:]
+ dst = dst[simdBlockSize/2:]
+ processed += simdBlockSize
+ }
+ return processed
+}
+
+// decodeBlock decodes one vector of hexadecimal digits into eight bytes.
+// It reports whether the input contained an invalid byte; if so, dst is not written.
+func decodeBlock(input archsimd.Uint8x16, dst []byte) (invalid bool) {
+ lower := input.Or(archsimd.BroadcastUint8x16(lowLetter)) // 'A'...'F' -> 'a'...'f'.
+ isDigit := input.Sub(archsimd.BroadcastUint8x16(digitRangeStart)).
+ Less(archsimd.BroadcastUint8x16(digitRangeSize))
+ isLetter := lower.Sub(archsimd.BroadcastUint8x16(letterRangeStart)).
+ Less(archsimd.BroadcastUint8x16(letterRangeSize))
+ isValid := isDigit.Or(isLetter)
+
+ validBits := isValid.ToInt8x16().ToBits().ReshapeToUint64s()
+ if validBits.GetElem(0) != ^uint64(0) || validBits.GetElem(1) != ^uint64(0) {
+ return true
+ }
+
+ nibble := input.And(archsimd.BroadcastUint8x16(lowNibbleMask)).
+ Add(isLetter.ToInt8x16().ToBits().And(archsimd.BroadcastUint8x16(letterNibbleOffset))).BitsToInt8()
+
+ words := nibble.ToBits().ReshapeToUint16s()
+ packed := words.And(archsimd.BroadcastUint16x8(lowByteMask)).ShiftAllLeft(4).
+ Or(words.ShiftAllRight(8)).ReshapeToUint8s().BitsToInt8().
+ LookupOrZero(decodePackMask).ToBits().ReshapeToUint64s()
+ byteorder.LEPutUint64(dst, packed.GetElem(0))
+ return false
+}
diff --git a/src/encoding/hex/hex_test.go b/src/encoding/hex/hex_test.go
index f90dec5..89f2c41 100644
--- a/src/encoding/hex/hex_test.go
+++ b/src/encoding/hex/hex_test.go
@@ -248,8 +248,9 @@
var sink []byte

func BenchmarkEncode(b *testing.B) {
- for _, size := range []int{256, 1024, 4096, 16384} {
- src := bytes.Repeat([]byte{2, 3, 5, 7, 9, 11, 13, 17}, size/8)
+ for _, size := range []int{8, 15, 16, 17, 31, 32, 33, 64, 256, 1024, 4096, 16384, 1 << 20} {
+ pattern := []byte{2, 3, 5, 7, 9, 11, 13, 17}
+ src := bytes.Repeat(pattern, (size+len(pattern)-1)/len(pattern))[:size]
sink = make([]byte, 2*size)

b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
@@ -262,8 +263,9 @@
}

func BenchmarkDecode(b *testing.B) {
- for _, size := range []int{256, 1024, 4096, 16384} {
- src := bytes.Repeat([]byte{'2', 'b', '7', '4', '4', 'f', 'a', 'a'}, size/8)
+ for _, size := range []int{8, 14, 16, 18, 30, 32, 34, 64, 256, 1024, 4096, 16384, 1 << 20} {
+ pattern := []byte("2B74fAa0")
+ src := bytes.Repeat(pattern, (size+len(pattern)-1)/len(pattern))[:size]
sink = make([]byte, size/2)

b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go
index 4959a42..41d6d7e 100644
--- a/src/go/build/deps_test.go
+++ b/src/go/build/deps_test.go
@@ -268,9 +268,13 @@

FMT, encoding, encoding/base32, encoding/base64, encoding/binary,
internal/saferio
- < encoding/ascii85, encoding/csv, encoding/gob, encoding/hex,
+ < encoding/ascii85, encoding/csv, encoding/gob,
encoding/pem, encoding/xml, mime;

+ FMT, encoding, encoding/base32, encoding/base64, encoding/binary,
+ internal/saferio, simd/archsimd
+ < encoding/hex;
+
STR, errors
< encoding/json/internal
< encoding/json/internal/jsonflags

Change information

Files:
  • M src/encoding/hex/hex.go
  • A src/encoding/hex/hex_fallback.go
  • A src/encoding/hex/hex_simd_amd64.go
  • A src/encoding/hex/hex_simd_arm64.go
  • A src/encoding/hex/hex_simd_test.go
  • A src/encoding/hex/hex_simd_wasm.go
  • M src/encoding/hex/hex_test.go
  • M src/go/build/deps_test.go
Change size: L
Delta: 8 files changed, 746 insertions(+), 8 deletions(-)
Open in Gerrit

Related details

Attention set is empty
Submit Requirements:
  • requirement is not satisfied: Code-Review
  • requirement satisfied: No-Unresolved-Comments
  • requirement is not satisfied: Review-Enforcement
  • requirement is not satisfied: TryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newchange
Gerrit-Project: go
Gerrit-Branch: master
Gerrit-Change-Id: Ic21fa6f0f864d825cfcf45e3d5c7a3bcbdc6e77f
Gerrit-Change-Number: 795820
Gerrit-PatchSet: 1
Gerrit-Owner: Илья <il.t...@gmail.com>
unsatisfied_requirement
satisfied_requirement
open
diffy

Gopher Robot (Gerrit)

unread,
Jun 30, 2026, 4:19:43 PM (8 hours ago) Jun 30
to Илья, goph...@pubsubhelper.golang.org, Jorropo, golang-co...@googlegroups.com
Attention needed from Jorropo

Message from Gopher Robot

Congratulations on opening your first change. Thank you for your contribution!

Next steps:
A maintainer will review your change and provide feedback. See
https://go.dev/doc/contribute#review for more info and tips to get your
patch through code review.

Most changes in the Go project go through a few rounds of revision. This can be
surprising to people new to the project. The careful, iterative review process
is our way of helping mentor contributors and ensuring that their contributions
have a lasting impact.

During May-July and Nov-Jan the Go project is in a code freeze, during which
little code gets reviewed or merged. If a reviewer responds with a comment like
R=go1.11 or adds a tag like "wait-release", it means that this CL will be
reviewed as part of the next development cycle. See https://go.dev/s/release
for more details.

Open in Gerrit

Related details

Attention is currently required from:
  • Jorropo
Submit Requirements:
  • requirement is not satisfied: Code-Review
  • requirement satisfied: No-Unresolved-Comments
  • requirement is not satisfied: Review-Enforcement
  • requirement is not satisfied: TryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: comment
Gerrit-Project: go
Gerrit-Branch: master
Gerrit-Change-Id: Ic21fa6f0f864d825cfcf45e3d5c7a3bcbdc6e77f
Gerrit-Change-Number: 795820
Gerrit-PatchSet: 1
Gerrit-Owner: Илья <il.t...@gmail.com>
Gerrit-Reviewer: Jorropo <jorro...@gmail.com>
Gerrit-CC: Gopher Robot <go...@golang.org>
Gerrit-Attention: Jorropo <jorro...@gmail.com>
Gerrit-Comment-Date: Tue, 30 Jun 2026 20:19:37 +0000
Gerrit-HasComments: No
Gerrit-Has-Labels: No
unsatisfied_requirement
satisfied_requirement
open
diffy

Jorropo (Gerrit)

unread,
Jun 30, 2026, 11:56:02 PM (15 minutes ago) Jun 30
to Илья, goph...@pubsubhelper.golang.org, Gopher Robot, golang-co...@googlegroups.com
Attention needed from Илья

Jorropo voted and added 14 comments

Votes added by Jorropo

Commit-Queue+1

14 comments

Patchset-level comments
File-level comment, Patchset 1 (Latest):
Jorropo . unresolved

Hey, thanks, this is great. I only skimmed the AMD64 implementation.
I'll take a serious look later.

Commit Message
Line 9, Patchset 1 (Latest):Change-Id: Ic21fa6f0f864d825cfcf45e3d5c7a3bcbdc6e77f
Jorropo . unresolved

Please split the commits into 3, one per architecture.

Given you are not using GitHub, "all" you need to do is make 3 commits on one branch and push your whole branch at once.
Gerrit will make 3 stacked CLs.

File src/encoding/hex/hex_simd_amd64.go
Line 35, Patchset 1 (Latest):var (
lowNibbleMask128 = [simdBlockSize]byte{

0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
}
lowNibbleMask256 = [avx2BlockSize]byte{

0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
}
Jorropo . unresolved

Broadcast ?


lowByteMask128 = [simdBlockSize / 2]uint16{
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
}
lowByteMask256 = [avx2BlockSize / 2]uint16{
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
}
Jorropo . unresolved

Wouldn't it make more sense to broadcast `0x00ff`?

Line 62, Patchset 1 (Latest): }
decodePairWeights128 = [simdBlockSize]int8{

16, 1, 16, 1, 16, 1, 16, 1,
16, 1, 16, 1, 16, 1, 16, 1,
}
decodePairWeights256 = [avx2BlockSize]int8{

16, 1, 16, 1, 16, 1, 16, 1,
16, 1, 16, 1, 16, 1, 16, 1,
16, 1, 16, 1, 16, 1, 16, 1,
16, 1, 16, 1, 16, 1, 16, 1,
}
Jorropo . unresolved

Here you could do an x16 broadcast and then convert it to an x8 (there is no actual instruction needed to convert x16 → x8; it's free).

Line 72, Patchset 1 (Latest): }
letterNibbleOffset128 = [simdBlockSize]byte{

9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9,
}
letterNibbleOffset256 = [avx2BlockSize]byte{

9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9,
}
Jorropo . unresolved

Broadcast ?

Line 82, Patchset 1 (Latest): }
digitRangeBefore128 = [simdBlockSize]int8{

0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
}
digitRangeBefore256 = [avx2BlockSize]int8{

0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
}
Jorropo . unresolved

Broadcast ?

Line 93, Patchset 1 (Latest): digitRangeAfter128 = [simdBlockSize]int8{

0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
}
digitRangeAfter256 = [avx2BlockSize]int8{

0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
}
Jorropo . unresolved

Broadcast ?

Line 103, Patchset 1 (Latest): lowerLetterRangeBefore128 = [simdBlockSize]int8{

0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
}
lowerLetterRangeBefore256 = [avx2BlockSize]int8{

0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
}
Jorropo . unresolved

Broadcast ?

Line 113, Patchset 1 (Latest): lowerLetterRangeAfter128 = [simdBlockSize]int8{

0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
}
lowerLetterRangeAfter256 = [avx2BlockSize]int8{

0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
}
Jorropo . unresolved

Broadcast ?

Line 123, Patchset 1 (Latest): normalizeLetterValue128 = [simdBlockSize]int8{

0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
}
normalizeLetterValue256 = [avx2BlockSize]int8{

0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
}
Jorropo . unresolved

Broadcast ?

Line 172, Patchset 1 (Latest):// encodeBlock encodes one 128-bit vector into two consecutive vectors of

// hexadecimal digits using AVX instructions.
func encodeBlock(x archsimd.Uint8x16) (encodedLo, encodedHi archsimd.Uint8x16) {
// Split every input byte into two 4-bit values. The high half is obtained by
// shifting 16-bit lanes right by 4; the second And with nibbleMask discards
// the bits that the lane-wide shift carries over from the adjacent byte.
// NOTE(review): lowNibbleMask128 is declared outside this view — presumably
// 0x0f in every byte; confirm.
nibbleMask := archsimd.LoadUint8x16Array(&lowNibbleMask128)
lowNibbles := x.And(nibbleMask)
highNibbles := x.AsUint16x8().ShiftAllRight(4).AsUint8x16().And(nibbleMask)

// Translate each 4-bit value into a digit; PermuteOrZero is used here as a
// table lookup with the nibble as the per-byte index into hexTable.
// NOTE(review): assumes hexTable128 holds the 16 hexadecimal digit characters
// in index order — confirm against its declaration.
hexTable := archsimd.LoadUint8x16Array(&hexTable128)
lowDigits := hexTable.PermuteOrZero(lowNibbles.AsInt8x16())
highDigits := hexTable.PermuteOrZero(highNibbles.AsInt8x16())

// View the digits as 16-bit words (each word spans two adjacent input bytes)
// and regroup them so that one word carries both digits of a single input
// byte: evenPairs combines the low byte of highWords with the low byte of
// lowWords moved into the high position; oddPairs combines the high byte of
// highWords moved into the low position with the high byte of lowWords.
// NOTE(review): this reading assumes lowByteMask128 is 0x00ff in every word —
// confirm.
lowByteMask := archsimd.LoadUint16x8Array(&lowByteMask128)
highWords := highDigits.AsUint16x8()
lowWords := lowDigits.AsUint16x8()
evenPairs := highWords.And(lowByteMask).Or(lowWords.ShiftAllLeft(8))
oddPairs := highWords.ShiftAllRight(8).Or(lowWords.And(lowByteMask.ShiftAllLeft(8)))

// Interleave the even and odd words; the InterleaveLo result is returned as
// encodedLo and the InterleaveHi result as encodedHi.
return evenPairs.InterleaveLo(oddPairs).AsUint8x16(), evenPairs.InterleaveHi(oddPairs).AsUint8x16()

}

// encodeBlockAVX2 encodes one 256-bit vector into two consecutive vectors of
// hexadecimal digits.

func encodeBlockAVX2(x archsimd.Uint8x32) (encodedLo, encodedHi archsimd.Uint8x32) {
// Split every input byte into two 4-bit values. The high half is obtained by
// shifting 16-bit lanes right by 4; the second And with nibbleMask discards
// the bits that the lane-wide shift carries over from the adjacent byte.
// NOTE(review): lowNibbleMask256 is declared outside this view — presumably
// 0x0f in every byte; confirm.
nibbleMask := archsimd.LoadUint8x32Array(&lowNibbleMask256)
lowNibbles := x.And(nibbleMask)
highNibbles := x.AsUint16x16().ShiftAllRight(4).AsUint8x32().And(nibbleMask)

// Translate each 4-bit value into a digit, using the nibble as the per-byte
// index into hexTable.
// NOTE(review): the Grouped variant presumably looks up within each 128-bit
// half independently, which would require hexTable256 to repeat the 16-digit
// table in both halves — confirm against its declaration.
hexTable := archsimd.LoadUint8x32Array(&hexTable256)
lowDigits := hexTable.PermuteOrZeroGrouped(lowNibbles.AsInt8x32())
highDigits := hexTable.PermuteOrZeroGrouped(highNibbles.AsInt8x32())

// View the digits as 16-bit words (each word spans two adjacent input bytes)
// and regroup them so that one word carries both digits of a single input
// byte: evenPairs combines the low byte of highWords with the low byte of
// lowWords moved into the high position; oddPairs combines the high byte of
// highWords moved into the low position with the high byte of lowWords.
// NOTE(review): this reading assumes lowByteMask256 is 0x00ff in every word —
// confirm.
lowByteMask := archsimd.LoadUint16x16Array(&lowByteMask256)
highWords := highDigits.AsUint16x16()
lowWords := lowDigits.AsUint16x16()
evenPairs := highWords.And(lowByteMask).Or(lowWords.ShiftAllLeft(8))
oddPairs := highWords.ShiftAllRight(8).Or(lowWords.And(lowByteMask.ShiftAllLeft(8)))

// Interleave the even and odd words.
// NOTE(review): the Grouped interleaves presumably work per 128-bit half, so
// each result would mix output belonging to both encodedLo and encodedHi —
// confirm.
interleavedLo := evenPairs.InterleaveLoGrouped(oddPairs)
interleavedHi := evenPairs.InterleaveHiGrouped(oddPairs)

// Recombine 128-bit pieces of interleavedLo and interleavedHi into the two
// results: selectors (0, 2) build encodedLo and selectors (1, 3) build
// encodedHi.
// NOTE(review): presumably indices 0-1 select from the receiver and 2-3 from
// the argument — confirm the ConcatPermute128Scalars index convention.
encodedLo = interleavedLo.ConcatPermute128Scalars(0, 2, interleavedHi).AsUint8x32()
encodedHi = interleavedLo.ConcatPermute128Scalars(1, 3, interleavedHi).AsUint8x32()
return encodedLo, encodedHi
}
Jorropo . unresolved

I'm not sure why these need to be functions; at first glance (I didn't check) they look too long to be inlined, and it's not good to have a function call in a hot loop.

Also, you could prepare values like `nibbleMask`, `hexTable`, ... once before the loop and just keep them in registers.

Line 254, Patchset 1 (Latest):// decodeBlock decodes one 128-bit vector of hexadecimal digits into eight bytes.

// It returns whether the input contained an invalid byte.
func decodeBlock(input archsimd.Uint8x16, dst []byte) (invalid bool) {
// c is the input reinterpreted as signed bytes so the signed Greater
// comparisons below can be applied to it.
c := input.BitsToInt8()
// NOTE(review): digitRangeBefore128 is declared outside this view —
// presumably 0x2f (one below '0') in every byte; confirm.
digitRangeBefore := archsimd.LoadInt8x16Array(&digitRangeBefore128)
// 0x3a in every byte: one above '9'.
digitRangeAfter := archsimd.LoadInt8x16Array(&digitRangeAfter128)
// 0x60 and 0x67 in every byte: one below 'a' and one above 'f'.
lowerLetterRangeBefore := archsimd.LoadInt8x16Array(&lowerLetterRangeBefore128)
lowerLetterRangeAfter := archsimd.LoadInt8x16Array(&lowerLetterRangeAfter128)
// OR-ing 0x20 into every byte folds 'A'..'F' onto 'a'..'f', so a single range
// test covers both letter cases.
lower := input.Or(archsimd.LoadInt8x16Array(&normalizeLetterValue128).AsUint8x16())
lowerC := lower.BitsToInt8()


// Use strict bounds because signed Greater maps directly to VPCMPGTB,
// while inclusive comparisons require multiple instructions.
isDigit := c.Greater(digitRangeBefore).
And(digitRangeAfter.Greater(c))
isLetter := lowerC.Greater(lowerLetterRangeBefore).
And(lowerLetterRangeAfter.Greater(lowerC))
isValid := isDigit.Or(isLetter)

// Report the block as invalid unless all 16 lanes passed one of the range
// tests. Nothing is written to dst in that case; the caller is left to
// locate the offending byte.
if isValid.ToBits() != 0xffff {
return true
}

// Numeric value of each character: its low four bits, plus an offset that is
// applied only in the letter lanes.
// NOTE(review): letterNibbleOffset128 is declared outside this view —
// presumably 9 in every byte, since 'a'&0x0f is 1 and must become 10; confirm.
nibble := input.And(archsimd.LoadUint8x16Array(&lowNibbleMask128)).
Add(isLetter.ToInt8x16().ToBits().And(archsimd.LoadUint8x16Array(&letterNibbleOffset128)))

// NOTE(review): decodeNibbles is defined outside this view — presumably it
// merges each pair of nibbles into one byte and stores the eight results in
// dst; confirm.
decodeNibbles(nibble, dst)
return false

}

// decodeBlockAVX2 decodes one 256-bit vector of hexadecimal digits into
// sixteen bytes using AVX2 instructions.
// It returns whether the input contained an invalid byte.
func decodeBlockAVX2(input archsimd.Uint8x32, dst []byte) (invalid bool) {
// c is the input reinterpreted as signed bytes so the signed Greater
// comparisons below can be applied to it.
c := input.BitsToInt8()
// OR-ing 0x20 into every byte folds 'A'..'F' onto 'a'..'f', so a single range
// test covers both letter cases.
lower := input.Or(archsimd.LoadInt8x32Array(&normalizeLetterValue256).AsUint8x32())
lowerC := lower.BitsToInt8()

// See the 128-bit path for why these use strict bounds.
// The visible bounds are 0x3a (one above '9'), 0x60 (one below 'a') and 0x67
// (one above 'f').
// NOTE(review): digitRangeBefore256 is presumably 0x2f (one below '0') —
// confirm against its declaration.
isDigit := c.Greater(archsimd.LoadInt8x32Array(&digitRangeBefore256)).
And(archsimd.LoadInt8x32Array(&digitRangeAfter256).Greater(c))
isLetter := lowerC.Greater(archsimd.LoadInt8x32Array(&lowerLetterRangeBefore256)).
And(archsimd.LoadInt8x32Array(&lowerLetterRangeAfter256).Greater(lowerC))
isValid := isDigit.Or(isLetter)

// Report the block as invalid unless all 32 lanes passed one of the range
// tests. Nothing is written to dst in that case.
if isValid.ToBits() != 0xffffffff {
return true
}

// Numeric value of each character: its low four bits, plus an offset that is
// applied only in the letter lanes.
// NOTE(review): letterNibbleOffset256 is declared outside this view —
// presumably 9 in every byte, since 'a'&0x0f is 1 and must become 10; confirm.
nibble := input.And(archsimd.LoadUint8x32Array(&lowNibbleMask256)).
Add(isLetter.ToInt8x32().ToBits().And(archsimd.LoadUint8x32Array(&letterNibbleOffset256)))

// Combine each pair of adjacent nibbles into one value and compact the results.
// NOTE(review): decodePairWeights256 and decodePackMask256 are declared outside
// this view — presumably the weights are (16, 1) so each pair yields hi*16+lo,
// and the mask gathers the low byte of every 16-bit result into the first
// eight bytes of each 128-bit half; confirm.
packed := nibble.DotProductPairsSaturated(archsimd.LoadInt8x32Array(&decodePairWeights256)).
ToBits().AsUint8x32().
PermuteOrZeroGrouped(archsimd.LoadInt8x32Array(&decodePackMask256))
// Join one 64-bit element from the low half with one from the high half into
// a single 16-byte vector.
// NOTE(review): presumably index 0 selects element 0 of the receiver and index
// 2 selects element 0 of the argument — confirm the ConcatPermuteScalars index
// convention.
decoded := packed.GetLo().AsUint64x2().
ConcatPermuteScalars(0, 2, packed.GetHi().AsUint64x2()).AsUint8x16()
// dst[:16] requires dst to have capacity for at least 16 bytes.
decoded.Store(dst[:16])
return false
}
Jorropo . unresolved

I'm not sure why these need to be functions; at first glance (I didn't check) they look too long to be inlined, and it's not good to have a function call in a hot loop.

Also, you could prepare values like `nibbleMask`, `hexTable`, ... once before the loop and just keep them in registers.

Open in Gerrit

Related details

Attention is currently required from:
  • Илья
Submit Requirements:
    • requirement is not satisfiedCode-Review
    • requirement is not satisfiedNo-Unresolved-Comments
    • requirement is not satisfiedReview-Enforcement
    • requirement is not satisfiedTryBots-Pass
    Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
    Gerrit-MessageType: comment
    Gerrit-Project: go
    Gerrit-Branch: master
    Gerrit-Change-Id: Ic21fa6f0f864d825cfcf45e3d5c7a3bcbdc6e77f
    Gerrit-Change-Number: 795820
    Gerrit-PatchSet: 1
    Gerrit-Owner: Илья <il.t...@gmail.com>
    Gerrit-Reviewer: Jorropo <jorro...@gmail.com>
    Gerrit-CC: Gopher Robot <go...@golang.org>
    Gerrit-Attention: Илья <il.t...@gmail.com>
    Gerrit-Comment-Date: Wed, 01 Jul 2026 03:55:54 +0000
    Gerrit-HasComments: Yes
    Gerrit-Has-Labels: Yes
    unsatisfied_requirement
    open
    diffy
    Reply all
    Reply to author
    Forward
    0 new messages