encoding/hex: add SIMD encoding and decoding
diff --git a/src/encoding/hex/hex.go b/src/encoding/hex/hex.go
index 4499974..838758b 100644
--- a/src/encoding/hex/hex.go
+++ b/src/encoding/hex/hex.go
@@ -43,8 +43,9 @@
// of bytes written to dst, but this value is always [EncodedLen](len(src)).
// Encode implements hexadecimal encoding.
func Encode(dst, src []byte) int {
- j := 0
- for _, v := range src {
+ processed := encodeSIMD(dst, src)
+ j := processed * 2
+ for _, v := range src[processed:] {
dst[j] = hextable[v>>4]
dst[j+1] = hextable[v&0x0f]
j += 2
@@ -85,7 +86,8 @@
// If the input is malformed, Decode returns the number
// of bytes decoded before the error.
func Decode(dst, src []byte) (int, error) {
- i, j := 0, 0
+ processed := decodeSIMD(dst, src)
+ i, j := processed/2, processed
for ; j < len(src)-1; j += 2 {
p := src[j]
q := src[j+1]
diff --git a/src/encoding/hex/hex_fallback.go b/src/encoding/hex/hex_fallback.go
new file mode 100644
index 0000000..52e5371
--- /dev/null
+++ b/src/encoding/hex/hex_fallback.go
@@ -0,0 +1,15 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !goexperiment.simd || (!arm64 && !amd64 && !wasm)
+
+package hex
+
+func encodeSIMD(dst, src []byte) int { // Stub for builds without the SIMD experiment or on other architectures.
+ return 0 // No source bytes consumed, so Encode's scalar loop handles all of src.
+}
+
+func decodeSIMD(dst, src []byte) int { // Stub for builds without the SIMD experiment or on other architectures.
+ return 0 // No source bytes consumed, so Decode's scalar loop starts at the beginning of src.
+}
diff --git a/src/encoding/hex/hex_simd_amd64.go b/src/encoding/hex/hex_simd_amd64.go
new file mode 100644
index 0000000..8b4f95c
--- /dev/null
+++ b/src/encoding/hex/hex_simd_amd64.go
@@ -0,0 +1,320 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package hex
+
+import (
+ "internal/byteorder"
+ "simd/archsimd"
+)
+
+const (
+ simdBlockSize = 16
+ avx2BlockSize = 32
+)
+
+var (
+ hexTable128 = [simdBlockSize]byte{ // Maps a nibble value (0-15) to its lowercase ASCII hex digit.
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+ }
+ // AVX2 byte permutations operate independently on each 128-bit lane, so the 16-entry table is repeated in both lanes.
+ hexTable256 = [avx2BlockSize]byte{
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+ }
+)
+
+// Loading constants from memory keeps the 128-bit path compatible with AVX-only
+// CPUs; the corresponding archsimd broadcast operations require AVX2.
+var (
+ lowNibbleMask128 = [simdBlockSize]byte{ // 0x0f in every byte: keeps the low four bits of each byte.
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ }
+ lowNibbleMask256 = [avx2BlockSize]byte{
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ }
+ lowByteMask128 = [simdBlockSize / 2]uint16{ // 0x00ff in every 16-bit lane: keeps the low byte of each word.
+ 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+ }
+ lowByteMask256 = [avx2BlockSize / 2]uint16{
+ 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+ 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+ }
+ decodePackMask128 = [simdBlockSize]int8{ // Permute indices that move the even-indexed bytes to the front; NOTE(review): -1 presumably yields a zero byte — confirm.
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ }
+ decodePackMask256 = [avx2BlockSize]int8{ // Same index pattern repeated for each 128-bit lane.
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ }
+ decodePairWeights128 = [simdBlockSize]int8{ // Weights (16, 1) per digit pair: high nibble*16 + low nibble.
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ }
+ decodePairWeights256 = [avx2BlockSize]int8{
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ 16, 1, 16, 1, 16, 1, 16, 1,
+ }
+ letterNibbleOffset128 = [simdBlockSize]byte{ // Added to c&0x0f for letters: 'a'&0x0f is 1, and 1+9 is 10.
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ }
+ letterNibbleOffset256 = [avx2BlockSize]byte{
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9,
+ }
+ digitRangeBefore128 = [simdBlockSize]int8{ // '0' - 1: exclusive lower bound of the digit range.
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ }
+ digitRangeBefore256 = [avx2BlockSize]int8{
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
+ }
+ digitRangeAfter128 = [simdBlockSize]int8{ // '9' + 1: exclusive upper bound of the digit range.
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ }
+ digitRangeAfter256 = [avx2BlockSize]int8{
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ }
+ lowerLetterRangeBefore128 = [simdBlockSize]int8{ // 'a' - 1: exclusive lower bound of the letter range.
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ }
+ lowerLetterRangeBefore256 = [avx2BlockSize]int8{
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
+ }
+ lowerLetterRangeAfter128 = [simdBlockSize]int8{ // 'f' + 1: exclusive upper bound of the letter range.
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ }
+ lowerLetterRangeAfter256 = [avx2BlockSize]int8{
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
+ }
+ normalizeLetterValue128 = [simdBlockSize]int8{ // ASCII case bit: OR-ing it in maps 'A'..'F' onto 'a'..'f'.
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ }
+ normalizeLetterValue256 = [avx2BlockSize]int8{
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ }
+)
+
+// encodeSIMD encodes complete 32-byte (AVX2) and 16-byte (AVX) blocks of src into
+// dst and returns the number of source bytes consumed; any tail is left to the caller.
+func encodeSIMD(dst, src []byte) int {
+ if len(src) < simdBlockSize || !archsimd.X86.AVX() { // Less than one block, or no AVX: consume nothing.
+ return 0
+ }
+
+ processed := 0
+ useAVX2 := archsimd.X86.AVX2() && len(src) >= avx2BlockSize
+ if useAVX2 {
+ for len(src) >= avx2BlockSize {
+ encodedLo, encodedHi := encodeBlockAVX2(archsimd.LoadUint8x32(src))
+ encodedLo.Store(dst[:avx2BlockSize]) // Every source byte yields two digits, so one input block fills two output vectors.
+ encodedHi.Store(dst[avx2BlockSize : 2*avx2BlockSize])
+
+ src = src[avx2BlockSize:]
+ dst = dst[2*avx2BlockSize:]
+ processed += avx2BlockSize
+ }
+ }
+
+ for len(src) >= simdBlockSize { // Handles a remaining 16-byte block, or every block when AVX2 is not used.
+ encodedLo, encodedHi := encodeBlock(archsimd.LoadUint8x16(src))
+ encodedLo.Store(dst[:simdBlockSize])
+ encodedHi.Store(dst[simdBlockSize : 2*simdBlockSize])
+
+ src = src[simdBlockSize:]
+ dst = dst[2*simdBlockSize:]
+ processed += simdBlockSize
+ }
+
+ if useAVX2 { // Only the AVX2 path touched 256-bit vectors.
+ archsimd.ClearAVXUpperBits() // NOTE(review): presumably avoids AVX/SSE transition penalties — confirm against archsimd docs.
+ }
+ return processed
+}
+
+// encodeBlock encodes one 128-bit vector (16 source bytes) into two consecutive
+// vectors (32 bytes) of hexadecimal digits using AVX instructions.
+func encodeBlock(x archsimd.Uint8x16) (encodedLo, encodedHi archsimd.Uint8x16) {
+ nibbleMask := archsimd.LoadUint8x16Array(&lowNibbleMask128)
+ lowNibbles := x.And(nibbleMask)
+ highNibbles := x.AsUint16x8().ShiftAllRight(4).AsUint8x16().And(nibbleMask) // Shift on 16-bit lanes, then mask off bits that crossed in from the neighboring byte.
+
+ hexTable := archsimd.LoadUint8x16Array(&hexTable128)
+ lowDigits := hexTable.PermuteOrZero(lowNibbles.AsInt8x16()) // Table lookup: nibble value -> ASCII hex digit.
+ highDigits := hexTable.PermuteOrZero(highNibbles.AsInt8x16())
+
+ lowByteMask := archsimd.LoadUint16x8Array(&lowByteMask128)
+ highWords := highDigits.AsUint16x8()
+ lowWords := lowDigits.AsUint16x8()
+ evenPairs := highWords.And(lowByteMask).Or(lowWords.ShiftAllLeft(8)) // Per word: high digit then low digit of the even-indexed source byte.
+ oddPairs := highWords.ShiftAllRight(8).Or(lowWords.And(lowByteMask.ShiftAllLeft(8))) // Per word: high digit then low digit of the odd-indexed source byte (mask becomes 0xff00).
+
+ return evenPairs.InterleaveLo(oddPairs).AsUint8x16(), evenPairs.InterleaveHi(oddPairs).AsUint8x16() // Alternating even/odd words restores source order.
+}
+
+// encodeBlockAVX2 encodes one 256-bit vector (32 source bytes) into two consecutive
+// vectors (64 bytes) of hexadecimal digits.
+func encodeBlockAVX2(x archsimd.Uint8x32) (encodedLo, encodedHi archsimd.Uint8x32) {
+ nibbleMask := archsimd.LoadUint8x32Array(&lowNibbleMask256)
+ lowNibbles := x.And(nibbleMask)
+ highNibbles := x.AsUint16x16().ShiftAllRight(4).AsUint8x32().And(nibbleMask) // Shift on 16-bit lanes, then mask off bits that crossed in from the neighboring byte.
+
+ hexTable := archsimd.LoadUint8x32Array(&hexTable256)
+ lowDigits := hexTable.PermuteOrZeroGrouped(lowNibbles.AsInt8x32()) // Per-lane table lookup; hexTable256 repeats the table in each 128-bit lane.
+ highDigits := hexTable.PermuteOrZeroGrouped(highNibbles.AsInt8x32())
+
+ lowByteMask := archsimd.LoadUint16x16Array(&lowByteMask256)
+ highWords := highDigits.AsUint16x16()
+ lowWords := lowDigits.AsUint16x16()
+ evenPairs := highWords.And(lowByteMask).Or(lowWords.ShiftAllLeft(8)) // Per word: high digit then low digit of the even-indexed source byte.
+ oddPairs := highWords.ShiftAllRight(8).Or(lowWords.And(lowByteMask.ShiftAllLeft(8))) // Per word: high digit then low digit of the odd-indexed source byte.
+
+ interleavedLo := evenPairs.InterleaveLoGrouped(oddPairs) // NOTE(review): presumably interleaves within each 128-bit lane — confirm.
+ interleavedHi := evenPairs.InterleaveHiGrouped(oddPairs)
+ encodedLo = interleavedLo.ConcatPermute128Scalars(0, 2, interleavedHi).AsUint8x32() // NOTE(review): presumably lane 0 of each operand, i.e. digits for source bytes 0-15 — confirm.
+ encodedHi = interleavedLo.ConcatPermute128Scalars(1, 3, interleavedHi).AsUint8x32() // NOTE(review): presumably lane 1 of each operand, i.e. digits for source bytes 16-31 — confirm.
+ return encodedLo, encodedHi
+}
+
+// decodeSIMD decodes complete SIMD blocks and returns the number of source bytes
+// consumed. It stops, without consuming it, at the first block holding an invalid byte.
+func decodeSIMD(dst, src []byte) (processed int) {
+ if len(src) < simdBlockSize || !archsimd.X86.AVX() { // Less than one block, or no AVX: consume nothing.
+ return 0
+ }
+
+ useAVX2 := archsimd.X86.AVX2() && len(src) >= avx2BlockSize
+ if useAVX2 {
+ for len(src) >= avx2BlockSize {
+ if invalid := decodeBlockAVX2(archsimd.LoadUint8x32(src), dst); invalid {
+ archsimd.ClearAVXUpperBits()
+ return processed // Leave the bad block to the caller's scalar loop, which resumes at this offset.
+ }
+ src = src[avx2BlockSize:]
+ dst = dst[avx2BlockSize/2:] // Two hex digits decode to one byte.
+ processed += avx2BlockSize
+ }
+ }
+
+ for len(src) >= simdBlockSize { // Handles a remaining 16-byte block, or every block when AVX2 is not used.
+ if invalid := decodeBlock(archsimd.LoadUint8x16(src), dst); invalid {
+ if useAVX2 {
+ archsimd.ClearAVXUpperBits()
+ }
+ return processed
+ }
+ src = src[simdBlockSize:]
+ dst = dst[simdBlockSize/2:]
+ processed += simdBlockSize
+ }
+
+ if useAVX2 { // Only the AVX2 path touched 256-bit vectors.
+ archsimd.ClearAVXUpperBits()
+ }
+ return processed
+}
+
+// decodeBlock decodes one 128-bit vector of hexadecimal digits into eight bytes.
+// It returns whether the input contained an invalid byte; in that case dst is not written.
+func decodeBlock(input archsimd.Uint8x16, dst []byte) (invalid bool) {
+ c := input.BitsToInt8() // Signed view: bytes >= 0x80 become negative and fail both range checks below.
+ digitRangeBefore := archsimd.LoadInt8x16Array(&digitRangeBefore128)
+ digitRangeAfter := archsimd.LoadInt8x16Array(&digitRangeAfter128)
+ lowerLetterRangeBefore := archsimd.LoadInt8x16Array(&lowerLetterRangeBefore128)
+ lowerLetterRangeAfter := archsimd.LoadInt8x16Array(&lowerLetterRangeAfter128)
+ lower := input.Or(archsimd.LoadInt8x16Array(&normalizeLetterValue128).AsUint8x16()) // Set bit 0x20 so 'A'..'F' compare as 'a'..'f'.
+ lowerC := lower.BitsToInt8()
+
+ // Use strict bounds because signed Greater maps directly to VPCMPGTB,
+ // while inclusive comparisons require multiple instructions.
+ isDigit := c.Greater(digitRangeBefore).
+ And(digitRangeAfter.Greater(c))
+ isLetter := lowerC.Greater(lowerLetterRangeBefore).
+ And(lowerLetterRangeAfter.Greater(lowerC))
+ isValid := isDigit.Or(isLetter)
+
+ if isValid.ToBits() != 0xffff { // All 16 bits must be set, one per input byte.
+ return true
+ }
+
+ nibble := input.And(archsimd.LoadUint8x16Array(&lowNibbleMask128)). // Low four bits of each character ...
+ Add(isLetter.ToInt8x16().ToBits().And(archsimd.LoadUint8x16Array(&letterNibbleOffset128))) // ... plus 9 where the character is a letter.
+
+ decodeNibbles(nibble, dst)
+ return false
+}
+
+// decodeBlockAVX2 decodes one 256-bit vector of hexadecimal digits into
+// sixteen bytes using AVX2 instructions.
+// It returns whether the input contained an invalid byte; in that case dst is not written.
+func decodeBlockAVX2(input archsimd.Uint8x32, dst []byte) (invalid bool) {
+ c := input.BitsToInt8() // Signed view: bytes >= 0x80 become negative and fail both range checks below.
+ lower := input.Or(archsimd.LoadInt8x32Array(&normalizeLetterValue256).AsUint8x32()) // Set bit 0x20 so 'A'..'F' compare as 'a'..'f'.
+ lowerC := lower.BitsToInt8()
+ // See the 128-bit path for why these use strict bounds.
+ isDigit := c.Greater(archsimd.LoadInt8x32Array(&digitRangeBefore256)).
+ And(archsimd.LoadInt8x32Array(&digitRangeAfter256).Greater(c))
+ isLetter := lowerC.Greater(archsimd.LoadInt8x32Array(&lowerLetterRangeBefore256)).
+ And(archsimd.LoadInt8x32Array(&lowerLetterRangeAfter256).Greater(lowerC))
+ isValid := isDigit.Or(isLetter)
+
+ if isValid.ToBits() != 0xffffffff { // All 32 bits must be set, one per input byte.
+ return true
+ }
+
+ nibble := input.And(archsimd.LoadUint8x32Array(&lowNibbleMask256)). // Low four bits of each character ...
+ Add(isLetter.ToInt8x32().ToBits().And(archsimd.LoadUint8x32Array(&letterNibbleOffset256))) // ... plus 9 where the character is a letter.
+
+ packed := nibble.DotProductPairsSaturated(archsimd.LoadInt8x32Array(&decodePairWeights256)). // Each adjacent pair becomes high*16 + low in a 16-bit lane.
+ ToBits().AsUint8x32().
+ PermuteOrZeroGrouped(archsimd.LoadInt8x32Array(&decodePackMask256)) // Move the low byte of each 16-bit lane to the front of its 128-bit lane.
+ decoded := packed.GetLo().AsUint64x2(). // NOTE(review): presumably selects the first 64 bits of each half — confirm index semantics.
+ ConcatPermuteScalars(0, 2, packed.GetHi().AsUint64x2()).AsUint8x16()
+ decoded.Store(dst[:16])
+ return false
+}
+
+// decodeNibbles combines 16 nibble values into 8 decoded bytes and writes them to dst[0:8].
+func decodeNibbles(nibble archsimd.Uint8x16, dst []byte) {
+ packed := nibble.DotProductPairsSaturated(archsimd.LoadInt8x16Array(&decodePairWeights128)). // Each adjacent pair becomes high*16 + low in a 16-bit lane.
+ ToBits().AsUint8x16().
+ PermuteOrZero(archsimd.LoadInt8x16Array(&decodePackMask128)) // Move the low byte of each 16-bit lane into the first 8 bytes.
+ byteorder.LEPutUint64(dst, packed.AsUint64x2().GetElem(0)) // Only the first 64-bit element carries results.
+}
diff --git a/src/encoding/hex/hex_simd_arm64.go b/src/encoding/hex/hex_simd_arm64.go
new file mode 100644
index 0000000..a7d8d93
--- /dev/null
+++ b/src/encoding/hex/hex_simd_arm64.go
@@ -0,0 +1,89 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && arm64
+
+package hex
+
+import (
+ "internal/byteorder"
+ "simd/archsimd"
+)
+
+const simdBlockSize = 16 // Bytes handled per vector (Uint8x16).
+
+var hexTableVector = archsimd.LoadUint8x16([]byte(hextable)) // Encoding lookup table loaded from hextable.
+
+const lowNibbleMask = 0x0f // Keeps the low four bits of a byte.
+const lowLetter = 0x20 // 'A'...'F' -> 'a'...'f'.
+const digitRangeStart = byte('0')
+const digitRangeSize = byte('9') - digitRangeStart + 1 // Number of decimal digits (10).
+const letterRangeStart = byte('a')
+const letterRangeSize = byte('f') - letterRangeStart + 1 // Number of hex letters (6).
+const letterNibbleOffset = 9 // Added to c&0x0f for letters: 'a'&0x0f is 1, and 1+9 is 10.
+
+// encodeSIMD encodes complete 16-byte blocks of src into dst and returns the number
+// of source bytes consumed; any shorter tail is left to the caller.
+func encodeSIMD(dst, src []byte) (processed int) {
+ for len(src) >= simdBlockSize {
+ encodedLo, encodedHi := encodeBlock(archsimd.LoadUint8x16(src))
+ encodedLo.Store(dst[:simdBlockSize]) // Every source byte yields two digits, so one input block fills two output vectors.
+ encodedHi.Store(dst[simdBlockSize : 2*simdBlockSize])
+
+ src = src[simdBlockSize:]
+ dst = dst[2*simdBlockSize:]
+ processed += simdBlockSize
+ }
+ return processed
+}
+
+// encodeBlock encodes one vector (16 source bytes) into two consecutive vectors
+// (32 bytes) of hexadecimal digits.
+func encodeBlock(x archsimd.Uint8x16) (encodedLo, encodedHi archsimd.Uint8x16) {
+ lowNibbles := x.And(archsimd.BroadcastUint8x16(lowNibbleMask))
+ highNibbles := x.ShiftAllRight(4) // Shifted as bytes; unlike the low nibble, no mask is applied here.
+
+ lowDigits := hexTableVector.LookupOrZero(lowNibbles) // Table lookup: nibble value -> ASCII hex digit.
+ highDigits := hexTableVector.LookupOrZero(highNibbles)
+
+ return highDigits.InterleaveLo(lowDigits), highDigits.InterleaveHi(lowDigits) // High digit first, then low digit, for each source byte.
+}
+
+// decodeSIMD decodes complete 16-byte blocks and returns the number of source bytes
+// consumed. It stops, without consuming it, at the first block holding an invalid byte.
+func decodeSIMD(dst, src []byte) (processed int) {
+ for len(src) >= simdBlockSize {
+ if invalid := decodeBlock(archsimd.LoadUint8x16(src), dst); invalid {
+ return processed // Leave the bad block to the caller's scalar loop, which resumes at this offset.
+ }
+ src = src[simdBlockSize:]
+ dst = dst[simdBlockSize/2:] // Two hex digits decode to one byte.
+ processed += simdBlockSize
+ }
+ return processed
+}
+
+// decodeBlock decodes one vector of hexadecimal digits into eight bytes.
+// It returns whether the input contained an invalid byte; in that case dst is not written.
+func decodeBlock(input archsimd.Uint8x16, dst []byte) (invalid bool) {
+ lower := input.Or(archsimd.BroadcastUint8x16(lowLetter)) // 'A'...'F' -> 'a'...'f'.
+ isDigit := input.Sub(archsimd.BroadcastUint8x16(digitRangeStart)). // Range check as c-'0' < 10 on unsigned bytes.
+ Less(archsimd.BroadcastUint8x16(digitRangeSize))
+ isLetter := lower.Sub(archsimd.BroadcastUint8x16(letterRangeStart)). // Range check as c-'a' < 6 on unsigned bytes.
+ Less(archsimd.BroadcastUint8x16(letterRangeSize))
+ isValid := isDigit.Or(isLetter)
+
+ if isValid.ToInt8x16().ToBits().ReduceMin() == 0 { // A zero minimum means at least one byte matched neither range.
+ return true
+ }
+
+ nibble := input.And(archsimd.BroadcastUint8x16(lowNibbleMask)). // Low four bits of each character ...
+ Add(isLetter.ToInt8x16().ToBits().And(archsimd.BroadcastUint8x16(letterNibbleOffset))).BitsToInt8() // ... plus 9 where the character is a letter.
+
+ evenNibbles := nibble.ConcatEven(nibble) // NOTE(review): presumably the even-indexed elements, i.e. the high nibbles — confirm.
+ oddNibbles := nibble.ConcatOdd(nibble) // NOTE(review): presumably the odd-indexed elements, i.e. the low nibbles — confirm.
+ result := evenNibbles.ShiftAllLeft(4).Or(oddNibbles).ToBits().ReshapeToUint64s()
+ byteorder.LEPutUint64(dst, result.GetElem(0)) // Only the first 64-bit element is written: eight decoded bytes.
+ return false
+}
diff --git a/src/encoding/hex/hex_simd_test.go b/src/encoding/hex/hex_simd_test.go
new file mode 100644
index 0000000..d67af3f
--- /dev/null
+++ b/src/encoding/hex/hex_simd_test.go
@@ -0,0 +1,204 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && (arm64 || amd64 || wasm)
+
+package hex
+
+import (
+ "bytes"
+ "strconv"
+ "testing"
+)
+
+func TestEncodeSIMDBoundaries(t *testing.T) { // Compares Encode with a scalar reference at lengths around block-size multiples.
+ src := make([]byte, 257)
+ for i := range src {
+ src[i] = byte(i)
+ }
+
+ sizes := []int{ // Each group is a boundary value with its neighbors.
+ 0, 1,
+ 15, 16, 17,
+ 31, 32, 33,
+ 47, 48, 49,
+ 63, 64, 65,
+ 127, 128, 129,
+ 255, 256, 257,
+ }
+
+ const guardSize = 16 // Bytes of 0xa5 padding placed on each side of dst.
+ wantGuard := bytes.Repeat([]byte{0xa5}, guardSize)
+ for _, size := range sizes {
+ t.Run(strconv.Itoa(size), func(t *testing.T) {
+ encodedLen := EncodedLen(size)
+ backing := bytes.Repeat([]byte{0xa5}, encodedLen+2*guardSize)
+ dst := backing[guardSize : guardSize+encodedLen]
+
+ if got := Encode(dst, src[:size]); got != encodedLen {
+ t.Fatalf("Encode returned %d, want %d", got, encodedLen)
+ }
+ if want := encodeScalarForTest(src[:size]); !bytes.Equal(dst, want) {
+ t.Fatalf("Encode returned %q, want %q", dst, want)
+ }
+ if !bytes.Equal(backing[:guardSize], wantGuard) { // The guard bytes must be untouched.
+ t.Fatal("Encode wrote before dst")
+ }
+ if !bytes.Equal(backing[guardSize+encodedLen:], wantGuard) {
+ t.Fatal("Encode wrote past dst")
+ }
+ })
+ }
+}
+
+func TestDecodeSIMDBoundaries(t *testing.T) { // Compares Decode with a scalar reference at even lengths around block-size multiples.
+ sizes := []int{ // Each group is a boundary value with its even neighbors.
+ 0, 2,
+ 14, 16, 18,
+ 30, 32, 34,
+ 46, 48, 50,
+ 62, 64, 66,
+ 126, 128, 130,
+ 254, 256, 258,
+ }
+
+ const guardSize = 16 // Bytes of 0xa5 padding placed on each side of dst.
+ wantGuard := bytes.Repeat([]byte{0xa5}, guardSize)
+ for _, size := range sizes {
+ t.Run(strconv.Itoa(size), func(t *testing.T) {
+ const hexChars = "0123456789abcdefABCDEF" // Covers digits and both letter cases.
+ src := make([]byte, size)
+ for i := range src {
+ src[i] = hexChars[i%len(hexChars)]
+ }
+ decodedLen := DecodedLen(size)
+
+ backing := bytes.Repeat([]byte{0xa5}, decodedLen+2*guardSize)
+ dst := backing[guardSize : guardSize+decodedLen]
+
+ n, err := Decode(dst, src)
+ if err != nil {
+ t.Fatalf("Decode returned error: %v", err)
+ }
+ if n != decodedLen {
+ t.Fatalf("Decode returned %d, want %d", n, decodedLen)
+ }
+ if want := decodeScalarForTest(src); !bytes.Equal(dst, want) {
+ t.Fatalf("Decode returned %q, want %q", dst, want)
+ }
+ if !bytes.Equal(backing[:guardSize], wantGuard) { // The guard bytes must be untouched.
+ t.Fatal("Decode wrote before dst")
+ }
+ if !bytes.Equal(backing[guardSize+decodedLen:], wantGuard) {
+ t.Fatal("Decode wrote past dst")
+ }
+ })
+ }
+}
+
+func TestDecodeSIMDOddLength(t *testing.T) { // Odd-length input of '0' characters must decode the complete pairs and return ErrLength.
+ for _, size := range []int{1, 15, 17, 31, 33, 47, 49} {
+ t.Run(strconv.Itoa(size), func(t *testing.T) {
+ src := bytes.Repeat([]byte{'0'}, size)
+ dst := bytes.Repeat([]byte{0xa5}, DecodedLen(size))
+
+ n, err := Decode(dst, src)
+ if err != ErrLength {
+ t.Fatalf("Decode returned error %v, want ErrLength", err)
+ }
+ if n != DecodedLen(size) {
+ t.Fatalf("Decode returned %d, want %d", n, DecodedLen(size))
+ }
+ if want := make([]byte, n); !bytes.Equal(dst, want) { // Every "00" pair decodes to a zero byte.
+ t.Fatalf("Decode returned %q, want %q", dst, want)
+ }
+ })
+ }
+}
+
+func TestDecodeSIMDErrors(t *testing.T) { // Places one invalid byte at various offsets and checks the error, the count and the untouched tail of dst.
+ tests := []struct {
+ size int // Length of the input.
+ pos int // Offset of the invalid byte.
+ }{
+ {32, 0},
+ {32, 1},
+ {32, 2},
+ {32, 14},
+ {32, 15},
+ {32, 16},
+ {32, 17},
+ {32, 18},
+ {32, 30},
+ {32, 31},
+ {33, 32},
+ }
+ for _, test := range tests {
+ name := strconv.Itoa(test.pos) + "/" + strconv.Itoa(test.size)
+ t.Run(name, func(t *testing.T) {
+ src := bytes.Repeat([]byte{'0'}, test.size)
+ src[test.pos] = 'z'
+ dst := bytes.Repeat([]byte{0xa5}, DecodedLen(test.size))
+
+ n, err := Decode(dst, src)
+ if err != InvalidByteError('z') {
+ t.Fatalf("Decode returned error %v, want InvalidByteError(%q)", err, 'z')
+ }
+ wantN := test.pos / 2 // Pairs that precede the one containing the invalid byte.
+ if n != wantN {
+ t.Fatalf("Decode returned %d, want %d", n, wantN)
+ }
+ if want := make([]byte, n); !bytes.Equal(dst[:n], want) {
+ t.Fatalf("Decode returned %q, want %q", dst[:n], want)
+ }
+ if want := bytes.Repeat([]byte{0xa5}, len(dst)-n); !bytes.Equal(dst[n:], want) { // Bytes past the valid prefix must keep the 0xa5 fill.
+ t.Fatalf("Decode wrote past the valid prefix: got %x, want %x", dst[n:], want)
+ }
+ })
+ }
+}
+
+func TestDecodeSIMDAllBytes(t *testing.T) { // Tries all 256 byte values at several offsets of a 32-byte input.
+ for _, pos := range []int{0, 15, 16, 31} {
+ for value := 0; value < 256; value++ {
+ c := byte(value)
+ src := bytes.Repeat([]byte{'0'}, 32)
+ src[pos] = c
+ dst := bytes.Repeat([]byte{0xa5}, DecodedLen(len(src)))
+
+ n, err := Decode(dst, src)
+ valid := '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' // Reference definition of a hex digit.
+ if valid {
+ if err != nil || n != len(dst) {
+ t.Fatalf("Decode with byte %#02x at %d returned %d, %v", c, pos, n, err)
+ }
+ if want := decodeScalarForTest(src); !bytes.Equal(dst, want) {
+ t.Fatalf("Decode with byte %#02x at %d returned %x, want %x", c, pos, dst, want)
+ }
+ continue
+ }
+
+ if err != InvalidByteError(c) || n != pos/2 { // Invalid bytes must be reported with the count of pairs before them.
+ t.Fatalf("Decode with byte %#02x at %d returned %d, %v; want %d, InvalidByteError", c, pos, n, err, pos/2)
+ }
+ }
+ }
+}
+
+func encodeScalarForTest(src []byte) []byte { // Byte-at-a-time reference encoder that uses hextable directly.
+ dst := make([]byte, EncodedLen(len(src)))
+ for i, b := range src {
+ dst[2*i] = hextable[b>>4] // High nibble first.
+ dst[2*i+1] = hextable[b&0x0f]
+ }
+ return dst
+}
+
+func decodeScalarForTest(src []byte) []byte { // Pair-at-a-time reference decoder; performs no validation of src.
+ dst := make([]byte, DecodedLen(len(src)))
+ for i := 0; i < len(src)-1; i += 2 {
+ dst[i/2] = (reverseHexTable[src[i]] << 4) | reverseHexTable[src[i+1]]
+ }
+ return dst
+}
diff --git a/src/encoding/hex/hex_simd_wasm.go b/src/encoding/hex/hex_simd_wasm.go
new file mode 100644
index 0000000..7e76f71
--- /dev/null
+++ b/src/encoding/hex/hex_simd_wasm.go
@@ -0,0 +1,102 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && wasm
+
+package hex
+
+import (
+ "internal/byteorder"
+ "simd/archsimd"
+)
+
+const simdBlockSize = 16 // Bytes handled per vector (Uint8x16).
+
+var hexTableVector = archsimd.LoadUint8x16([]byte(hextable)).BitsToInt8() // Encoding lookup table loaded from hextable.
+var decodePackMask = archsimd.LoadInt8x16([]int8{ // Lookup indices that move the even-indexed bytes to the front; NOTE(review): -1 presumably yields zero — confirm.
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+})
+
+const lowNibbleMask = 0x0f // Keeps the low four bits of a byte.
+const lowByteMask = 0x00ff // Keeps the low byte of a 16-bit word.
+const lowLetter = 0x20 // 'A'...'F' -> 'a'...'f'.
+const digitRangeStart = byte('0')
+const digitRangeSize = byte('9') - digitRangeStart + 1 // Number of decimal digits (10).
+const letterRangeStart = byte('a')
+const letterRangeSize = byte('f') - letterRangeStart + 1 // Number of hex letters (6).
+const letterNibbleOffset = 9 // Added to c&0x0f for letters: 'a'&0x0f is 1, and 1+9 is 10.
+
+// encodeSIMD encodes complete 16-byte blocks of src into dst and returns the number
+// of source bytes consumed; any shorter tail is left to the caller.
+func encodeSIMD(dst, src []byte) (processed int) {
+ for len(src) >= simdBlockSize {
+ encodedLo, encodedHi := encodeBlock(archsimd.LoadUint8x16(src))
+ encodedLo.Store(dst[:simdBlockSize]) // Every source byte yields two digits, so one input block fills two output vectors.
+ encodedHi.Store(dst[simdBlockSize : 2*simdBlockSize])
+
+ src = src[simdBlockSize:]
+ dst = dst[2*simdBlockSize:]
+ processed += simdBlockSize
+ }
+ return processed
+}
+
+// encodeBlock encodes one vector (16 source bytes) into two consecutive vectors
+// (32 bytes) of hexadecimal digits.
+func encodeBlock(x archsimd.Uint8x16) (encodedLo, encodedHi archsimd.Uint8x16) {
+ lowNibbles := x.And(archsimd.BroadcastUint8x16(lowNibbleMask))
+ highNibbles := x.ShiftAllRight(4) // Shifted as bytes; unlike the low nibble, no mask is applied here.
+
+ lowDigits := hexTableVector.LookupOrZero(lowNibbles.BitsToInt8()).ToBits() // Table lookup: nibble value -> ASCII hex digit.
+ highDigits := hexTableVector.LookupOrZero(highNibbles.BitsToInt8()).ToBits()
+
+ encodedLo = highDigits.ExtendLo8ToUint16(). // Widen each high digit to a word, keeping it in the low byte ...
+ Or(lowDigits.ExtendLo8ToUint16().ShiftAllLeft(8)). // ... and put the matching low digit in the high byte.
+ ReshapeToUint8s()
+ encodedHi = highDigits.ExtendHi8ToUint16(). // Same combination for the upper eight source bytes.
+ Or(lowDigits.ExtendHi8ToUint16().ShiftAllLeft(8)).
+ ReshapeToUint8s()
+ return encodedLo, encodedHi
+}
+
+// decodeSIMD decodes complete 16-byte blocks and returns the number of source bytes
+// consumed. It stops, without consuming it, at the first block holding an invalid byte.
+func decodeSIMD(dst, src []byte) (processed int) {
+ for len(src) >= simdBlockSize {
+ if invalid := decodeBlock(archsimd.LoadUint8x16(src), dst); invalid {
+ return processed // Leave the bad block to the caller's scalar loop, which resumes at this offset.
+ }
+ src = src[simdBlockSize:]
+ dst = dst[simdBlockSize/2:] // Two hex digits decode to one byte.
+ processed += simdBlockSize
+ }
+ return processed
+}
+
+// decodeBlock decodes one vector of hexadecimal digits into eight bytes.
+// It returns whether the input contained an invalid byte; in that case dst is not written.
+func decodeBlock(input archsimd.Uint8x16, dst []byte) (invalid bool) {
+ lower := input.Or(archsimd.BroadcastUint8x16(lowLetter)) // 'A'...'F' -> 'a'...'f'.
+ isDigit := input.Sub(archsimd.BroadcastUint8x16(digitRangeStart)). // Range check as c-'0' < 10 on unsigned bytes.
+ Less(archsimd.BroadcastUint8x16(digitRangeSize))
+ isLetter := lower.Sub(archsimd.BroadcastUint8x16(letterRangeStart)). // Range check as c-'a' < 6 on unsigned bytes.
+ Less(archsimd.BroadcastUint8x16(letterRangeSize))
+ isValid := isDigit.Or(isLetter)
+
+ validBits := isValid.ToInt8x16().ToBits().ReshapeToUint64s()
+ if validBits.GetElem(0) != ^uint64(0) || validBits.GetElem(1) != ^uint64(0) { // Both 64-bit halves of the mask must be all ones.
+ return true
+ }
+
+ nibble := input.And(archsimd.BroadcastUint8x16(lowNibbleMask)). // Low four bits of each character ...
+ Add(isLetter.ToInt8x16().ToBits().And(archsimd.BroadcastUint8x16(letterNibbleOffset))).BitsToInt8() // ... plus 9 where the character is a letter.
+
+ words := nibble.ToBits().ReshapeToUint16s() // View each pair of nibble values as one 16-bit word.
+ packed := words.And(archsimd.BroadcastUint16x8(lowByteMask)).ShiftAllLeft(4). // Low byte of the word shifted left by four ...
+ Or(words.ShiftAllRight(8)).ReshapeToUint8s().BitsToInt8(). // ... combined with the high byte moved down.
+ LookupOrZero(decodePackMask).ToBits().ReshapeToUint64s() // Gather the low byte of every word into the first eight bytes.
+ byteorder.LEPutUint64(dst, packed.GetElem(0)) // Only the first 64-bit element is written: eight decoded bytes.
+ return false
+}
diff --git a/src/encoding/hex/hex_test.go b/src/encoding/hex/hex_test.go
index f90dec5..89f2c41 100644
--- a/src/encoding/hex/hex_test.go
+++ b/src/encoding/hex/hex_test.go
@@ -248,8 +248,9 @@
var sink []byte
func BenchmarkEncode(b *testing.B) {
- for _, size := range []int{256, 1024, 4096, 16384} {
- src := bytes.Repeat([]byte{2, 3, 5, 7, 9, 11, 13, 17}, size/8)
+ for _, size := range []int{8, 15, 16, 17, 31, 32, 33, 64, 256, 1024, 4096, 16384, 1 << 20} {
+ pattern := []byte{2, 3, 5, 7, 9, 11, 13, 17}
+ src := bytes.Repeat(pattern, (size+len(pattern)-1)/len(pattern))[:size]
sink = make([]byte, 2*size)
b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
@@ -262,8 +263,9 @@
}
func BenchmarkDecode(b *testing.B) {
- for _, size := range []int{256, 1024, 4096, 16384} {
- src := bytes.Repeat([]byte{'2', 'b', '7', '4', '4', 'f', 'a', 'a'}, size/8)
+ for _, size := range []int{8, 14, 16, 18, 30, 32, 34, 64, 256, 1024, 4096, 16384, 1 << 20} {
+ pattern := []byte("2B74fAa0")
+ src := bytes.Repeat(pattern, (size+len(pattern)-1)/len(pattern))[:size]
sink = make([]byte, size/2)
b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go
index 4959a42..41d6d7e 100644
--- a/src/go/build/deps_test.go
+++ b/src/go/build/deps_test.go
@@ -268,9 +268,13 @@
FMT, encoding, encoding/base32, encoding/base64, encoding/binary,
internal/saferio
- < encoding/ascii85, encoding/csv, encoding/gob, encoding/hex,
+ < encoding/ascii85, encoding/csv, encoding/gob,
encoding/pem, encoding/xml, mime;
+ FMT, encoding, encoding/base32, encoding/base64, encoding/binary,
+ internal/saferio, simd/archsimd
+ < encoding/hex;
+
STR, errors
< encoding/json/internal
< encoding/json/internal/jsonflags
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Congratulations on opening your first change. Thank you for your contribution!
Next steps:
A maintainer will review your change and provide feedback. See
https://go.dev/doc/contribute#review for more info and tips to get your
patch through code review.
Most changes in the Go project go through a few rounds of revision. This can be
surprising to people new to the project. The careful, iterative review process
is our way of helping mentor contributors and ensuring that their contributions
have a lasting impact.
During May-July and Nov-Jan the Go project is in a code freeze, during which
little code gets reviewed or merged. If a reviewer responds with a comment like
R=go1.11 or adds a tag like "wait-release", it means that this CL will be
reviewed as part of the next development cycle. See https://go.dev/s/release
for more details.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
Hey thanks, this is great, I only skimmed the AMD64 implementation.
I'll take a serious look later.
Please add `benchstat` results:
https://pkg.go.dev/golang.org/x/perf/cmd/benchstat
Change-Id: Ic21fa6f0f864d825cfcf45e3d5c7a3bcbdc6e77f
Please split the commits into 3, one per architecture.
Given that you are not using GitHub, all you need to do is make 3 commits on one branch and push the whole branch at once.
Gerrit will make 3 stacked CLs.
var (
lowNibbleMask128 = [simdBlockSize]byte{
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
}
lowNibbleMask256 = [avx2BlockSize]byte{
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
}
Broadcast?
lowByteMask128 = [simdBlockSize / 2]uint16{
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
}
lowByteMask256 = [avx2BlockSize / 2]uint16{
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
}
Wouldn't it make more sense to broadcast `0x00ff`?
}
decodePairWeights128 = [simdBlockSize]int8{
16, 1, 16, 1, 16, 1, 16, 1,
16, 1, 16, 1, 16, 1, 16, 1,
}
decodePairWeights256 = [avx2BlockSize]int8{
16, 1, 16, 1, 16, 1, 16, 1,
16, 1, 16, 1, 16, 1, 16, 1,
16, 1, 16, 1, 16, 1, 16, 1,
16, 1, 16, 1, 16, 1, 16, 1,
}
For this you could do an x16 broadcast and then convert it to an x8 (there is no actual instruction to convert x16 → x8; it's free).
}
letterNibbleOffset128 = [simdBlockSize]byte{
9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9,
}
letterNibbleOffset256 = [avx2BlockSize]byte{
9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9,
}
Broadcast?
}
digitRangeBefore128 = [simdBlockSize]int8{
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
}
digitRangeBefore256 = [avx2BlockSize]int8{
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
}
Broadcast?
digitRangeAfter128 = [simdBlockSize]int8{
0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
}
digitRangeAfter256 = [avx2BlockSize]int8{
0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
}
Broadcast?
lowerLetterRangeBefore128 = [simdBlockSize]int8{
0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
}
lowerLetterRangeBefore256 = [avx2BlockSize]int8{
0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60,
}
Broadcast?
lowerLetterRangeAfter128 = [simdBlockSize]int8{
0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
}
lowerLetterRangeAfter256 = [avx2BlockSize]int8{
0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67, 0x67,
}
Broadcast?
normalizeLetterValue128 = [simdBlockSize]int8{
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
}
normalizeLetterValue256 = [avx2BlockSize]int8{
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
}
Broadcast?
// encodeBlock encodes one 128-bit vector into two consecutive vectors of
// hexadecimal digits using AVX instructions.
func encodeBlock(x archsimd.Uint8x16) (encodedLo, encodedHi archsimd.Uint8x16) {
nibbleMask := archsimd.LoadUint8x16Array(&lowNibbleMask128)
lowNibbles := x.And(nibbleMask)
highNibbles := x.AsUint16x8().ShiftAllRight(4).AsUint8x16().And(nibbleMask)
hexTable := archsimd.LoadUint8x16Array(&hexTable128)
lowDigits := hexTable.PermuteOrZero(lowNibbles.AsInt8x16())
highDigits := hexTable.PermuteOrZero(highNibbles.AsInt8x16())
lowByteMask := archsimd.LoadUint16x8Array(&lowByteMask128)
highWords := highDigits.AsUint16x8()
lowWords := lowDigits.AsUint16x8()
evenPairs := highWords.And(lowByteMask).Or(lowWords.ShiftAllLeft(8))
oddPairs := highWords.ShiftAllRight(8).Or(lowWords.And(lowByteMask.ShiftAllLeft(8)))
return evenPairs.InterleaveLo(oddPairs).AsUint8x16(), evenPairs.InterleaveHi(oddPairs).AsUint8x16()
}
// encodeBlockAVX2 encodes one 256-bit vector into two consecutive vectors of
// hexadecimal digits.
func encodeBlockAVX2(x archsimd.Uint8x32) (encodedLo, encodedHi archsimd.Uint8x32) {
nibbleMask := archsimd.LoadUint8x32Array(&lowNibbleMask256)
lowNibbles := x.And(nibbleMask)
highNibbles := x.AsUint16x16().ShiftAllRight(4).AsUint8x32().And(nibbleMask)
hexTable := archsimd.LoadUint8x32Array(&hexTable256)
lowDigits := hexTable.PermuteOrZeroGrouped(lowNibbles.AsInt8x32())
highDigits := hexTable.PermuteOrZeroGrouped(highNibbles.AsInt8x32())
lowByteMask := archsimd.LoadUint16x16Array(&lowByteMask256)
highWords := highDigits.AsUint16x16()
lowWords := lowDigits.AsUint16x16()
evenPairs := highWords.And(lowByteMask).Or(lowWords.ShiftAllLeft(8))
oddPairs := highWords.ShiftAllRight(8).Or(lowWords.And(lowByteMask.ShiftAllLeft(8)))
interleavedLo := evenPairs.InterleaveLoGrouped(oddPairs)
interleavedHi := evenPairs.InterleaveHiGrouped(oddPairs)
encodedLo = interleavedLo.ConcatPermute128Scalars(0, 2, interleavedHi).AsUint8x32()
encodedHi = interleavedLo.ConcatPermute128Scalars(1, 3, interleavedHi).AsUint8x32()
return encodedLo, encodedHi
}I'm not sure why theses need to be a function, at first glance (I didn't checked) they look too long to be inlined, and it's not good to have a function call in a hotloop.
Also you could prepare stuff like `nibbleMask` `hexTable`, ... before the loop once and just keep it in registers.
// decodeBlock decodes one 128-bit vector of hexadecimal digits into eight bytes.
// It returns whether the input contained an invalid byte.
func decodeBlock(input archsimd.Uint8x16, dst []byte) (invalid bool) {
c := input.BitsToInt8()
digitRangeBefore := archsimd.LoadInt8x16Array(&digitRangeBefore128)
digitRangeAfter := archsimd.LoadInt8x16Array(&digitRangeAfter128)
lowerLetterRangeBefore := archsimd.LoadInt8x16Array(&lowerLetterRangeBefore128)
lowerLetterRangeAfter := archsimd.LoadInt8x16Array(&lowerLetterRangeAfter128)
lower := input.Or(archsimd.LoadInt8x16Array(&normalizeLetterValue128).AsUint8x16())
lowerC := lower.BitsToInt8()
// Use strict bounds because signed Greater maps directly to VPCMPGTB,
// while inclusive comparisons require multiple instructions.
isDigit := c.Greater(digitRangeBefore).
And(digitRangeAfter.Greater(c))
isLetter := lowerC.Greater(lowerLetterRangeBefore).
And(lowerLetterRangeAfter.Greater(lowerC))
isValid := isDigit.Or(isLetter)
if isValid.ToBits() != 0xffff {
return true
}
nibble := input.And(archsimd.LoadUint8x16Array(&lowNibbleMask128)).
Add(isLetter.ToInt8x16().ToBits().And(archsimd.LoadUint8x16Array(&letterNibbleOffset128)))
decodeNibbles(nibble, dst)
return false
}
// decodeBlockAVX2 decodes one 256-bit vector of hexadecimal digits into
// sixteen bytes using AVX2 instructions.
// It returns whether the input contained an invalid byte.
func decodeBlockAVX2(input archsimd.Uint8x32, dst []byte) (invalid bool) {
c := input.BitsToInt8()
lower := input.Or(archsimd.LoadInt8x32Array(&normalizeLetterValue256).AsUint8x32())
lowerC := lower.BitsToInt8()
// See the 128-bit path for why these use strict bounds.
isDigit := c.Greater(archsimd.LoadInt8x32Array(&digitRangeBefore256)).
And(archsimd.LoadInt8x32Array(&digitRangeAfter256).Greater(c))
isLetter := lowerC.Greater(archsimd.LoadInt8x32Array(&lowerLetterRangeBefore256)).
And(archsimd.LoadInt8x32Array(&lowerLetterRangeAfter256).Greater(lowerC))
isValid := isDigit.Or(isLetter)
if isValid.ToBits() != 0xffffffff {
return true
}
nibble := input.And(archsimd.LoadUint8x32Array(&lowNibbleMask256)).
Add(isLetter.ToInt8x32().ToBits().And(archsimd.LoadUint8x32Array(&letterNibbleOffset256)))
packed := nibble.DotProductPairsSaturated(archsimd.LoadInt8x32Array(&decodePairWeights256)).
ToBits().AsUint8x32().
PermuteOrZeroGrouped(archsimd.LoadInt8x32Array(&decodePackMask256))
decoded := packed.GetLo().AsUint64x2().
ConcatPermuteScalars(0, 2, packed.GetHi().AsUint64x2()).AsUint8x16()
decoded.Store(dst[:16])
return false
}I'm not sure why theses need to be a function, at first glance (I didn't checked) they look too long to be inlined, and it's not good to have a function call in a hotloop.
Also you could prepare stuff like `nibbleMask` `hexTable`, ... before the loop once and just keep it in registers.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |