[dev.simd] runtime/gc: generate greentea expand kernels in Go SIMD
This CL adds a new generator to runtime/gc/scan that generates Go code
using the new simd package.
This CL also includes the plumbing: the runtime will use the Go SIMD
kernels if GOEXPERIMENT=simd is on.
diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go
index 93abfd3..9cd5b099 100644
--- a/src/go/build/deps_test.go
+++ b/src/go/build/deps_test.go
@@ -87,6 +87,7 @@
internal/profilerecord,
internal/trace/tracev2,
math/bits,
+ simd,
structs
< internal/bytealg
< internal/stringslite
@@ -826,7 +827,8 @@
os,
reflect,
+regexp,
strings,
sync
< internal/runtime/gc/internal/gen;
regexp, internal/txtar, internal/trace, internal/trace/raw
diff --git a/src/internal/runtime/gc/scan/expand_amd64.go b/src/internal/runtime/gc/scan/expand_amd64.go
index 9bea471..c5764e3 100644
--- a/src/internal/runtime/gc/scan/expand_amd64.go
+++ b/src/internal/runtime/gc/scan/expand_amd64.go
@@ -4,7 +4,11 @@
package scan
-import "internal/runtime/gc"
+import (
+ "internal/runtime/gc"
+ "simd"
+ "unsafe"
+)
// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked,
// where f is the word size of objects in sizeClass.
@@ -20,3 +24,14 @@
//
// It is defined in assembly.
var gcExpandersAVX512 [len(gc.SizeClassToSize)]uintptr
+
+// ExpandAVX512SIMD expands each bit in packed into f consecutive bits in unpacked,
+// where f is the word size of objects in sizeClass.
+//
+// This is a testing entrypoint to the expanders used by scanSpanPacked*.
+func ExpandAVX512SIMD(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) {
+	v1, v2 := gcExpandersAVX512SIMD[sizeClass](unsafe.Pointer(packed))
+	v1.Store((*[8]uint64)(unsafe.Pointer(unpacked)))
+	v2.Store((*[8]uint64)(unsafe.Add(unsafe.Pointer(unpacked), 64)))
+	simd.ClearAVXUpperBits()
+}
diff --git a/src/internal/runtime/gc/scan/expand_amd64_test.go b/src/internal/runtime/gc/scan/expand_amd64_test.go
index a8f5b88..692bc7c 100644
--- a/src/internal/runtime/gc/scan/expand_amd64_test.go
+++ b/src/internal/runtime/gc/scan/expand_amd64_test.go
@@ -17,3 +17,10 @@
}
testExpand(t, scan.ExpandAVX512)
}
+
+func TestExpandAVX512SIMD(t *testing.T) {
+ if !scan.CanAVX512() {
+ t.Skip("no AVX512")
+ }
+ testExpand(t, scan.ExpandAVX512SIMD)
+}
diff --git a/src/internal/runtime/gc/scan/expand_test.go b/src/internal/runtime/gc/scan/expand_test.go
index 692817d..2e75574 100644
--- a/src/internal/runtime/gc/scan/expand_test.go
+++ b/src/internal/runtime/gc/scan/expand_test.go
@@ -23,7 +23,7 @@
for i := range want {
if got[i] != want[i] {
- t.Errorf("expansion differs from reference at bit %d", i*goarch.PtrSize)
+ t.Errorf("expansion differs from reference at bit %d, sizeClass=%d", i*goarch.PtrSize, sizeClass)
if goarch.PtrSize == 4 {
t.Logf("got: %032b", got[i])
t.Logf("want: %032b", want[i])
diff --git a/src/internal/runtime/gc/scan/expanders_amd64.go b/src/internal/runtime/gc/scan/expanders_amd64.go
new file mode 100644
index 0000000..ea6c643
--- /dev/null
+++ b/src/internal/runtime/gc/scan/expanders_amd64.go
@@ -0,0 +1,1528 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+import (
+ "simd"
+ "unsafe"
+)
+
+var gcExpandersAVX512SIMD = [68]func(unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8){
+ nil,
+ expandAVX512_1SIMD,
+ expandAVX512_2SIMD,
+ expandAVX512_3SIMD,
+ expandAVX512_4SIMD,
+ expandAVX512_6SIMD,
+ expandAVX512_8SIMD,
+ expandAVX512_10SIMD,
+ expandAVX512_12SIMD,
+ expandAVX512_14SIMD,
+ expandAVX512_16SIMD,
+ expandAVX512_18SIMD,
+ expandAVX512_20SIMD,
+ expandAVX512_22SIMD,
+ expandAVX512_24SIMD,
+ expandAVX512_26SIMD,
+ expandAVX512_28SIMD,
+ expandAVX512_30SIMD,
+ expandAVX512_32SIMD,
+ expandAVX512_36SIMD,
+ expandAVX512_40SIMD,
+ expandAVX512_44SIMD,
+ expandAVX512_48SIMD,
+ expandAVX512_52SIMD,
+ expandAVX512_56SIMD,
+ expandAVX512_60SIMD,
+ expandAVX512_64SIMD,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+}
+
+func expandAVX512_1SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	x := simd.LoadUint64x8((*[8]uint64)(src))
+	y := simd.LoadUint64x8((*[8]uint64)(unsafe.Add(src, 64)))
+	return x, y
+}
+
+var expandAVX512_2SIMD_mat0 = [8]uint64{
+ 0x0101020204040808, 0x1010202040408080, 0x0101020204040808, 0x1010202040408080,
+ 0x0101020204040808, 0x1010202040408080, 0x0101020204040808, 0x1010202040408080,
+}
+var expandAVX512_2SIMD_inShuf0 = [8]uint64{
+ 0x0706050403020100, 0x0706050403020100, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+ 0x1716151413121110, 0x1716151413121110, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918,
+}
+var expandAVX512_2SIMD_inShuf1 = [8]uint64{
+ 0x2726252423222120, 0x2726252423222120, 0x2f2e2d2c2b2a2928, 0x2f2e2d2c2b2a2928,
+ 0x3736353433323130, 0x3736353433323130, 0x3f3e3d3c3b3a3938, 0x3f3e3d3c3b3a3938,
+}
+var expandAVX512_2SIMD_outShufLo = [8]uint64{
+ 0x0b030a0209010800, 0x0f070e060d050c04, 0x1b131a1219111810, 0x1f171e161d151c14,
+ 0x2b232a2229212820, 0x2f272e262d252c24, 0x3b333a3239313830, 0x3f373e363d353c34,
+}
+
+func expandAVX512_2SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_2SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_2SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_2SIMD_inShuf1).AsUint8x64()
+ v8 := simd.LoadUint64x8(&expandAVX512_2SIMD_outShufLo).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v6 := v0.Permute(v5)
+ v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v9 := v4.Permute(v8)
+ v10 := v7.Permute(v8)
+ return v9.AsUint64x8(), v10.AsUint64x8()
+}
+
+var expandAVX512_3SIMD_mat0 = [8]uint64{
+ 0x0101010202020404, 0x0408080810101020, 0x2020404040808080, 0x0101010202020404,
+ 0x0408080810101020, 0x2020404040808080, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_3SIMD_inShuf0 = [8]uint64{
+ 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0f0e0d0c0b0a0908,
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_3SIMD_inShuf1 = [8]uint64{
+ 0x1716151413121110, 0x1716151413121110, 0x1716151413121110, 0x1f1e1d1c1b1a1918,
+ 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_3SIMD_inShuf2 = [8]uint64{
+ 0x2726252423222120, 0x2726252423222120, 0x2726252423222120, 0xffffffffff2a2928,
+ 0xffffffffff2a2928, 0xffffffffffff2928, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_3SIMD_outShufLo = [8]uint64{
+ 0x0a02110901100800, 0x05140c04130b0312, 0x170f07160e06150d, 0x221a292119282018,
+ 0x1d2c241c2b231b2a, 0x2f271f2e261e2d25, 0x4a42514941504840, 0x45544c44534b4352,
+}
+var expandAVX512_3SIMD_outShufHi = [8]uint64{
+ 0x170f07160e06150d, 0x221a292119282018, 0x1d2c241c2b231b2a, 0x2f271f2e261e2d25,
+ 0x4a42514941504840, 0x45544c44534b4352, 0x574f47564e46554d, 0x625a696159686058,
+}
+
+func expandAVX512_3SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_3SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_3SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_3SIMD_inShuf1).AsUint8x64()
+ v8 := simd.LoadUint64x8(&expandAVX512_3SIMD_inShuf2).AsUint8x64()
+ v11 := simd.LoadUint64x8(&expandAVX512_3SIMD_outShufLo).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_3SIMD_outShufHi).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v6 := v0.Permute(v5)
+ v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v9 := v0.Permute(v8)
+ v10 := v9.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v12 := v4.Permute2(v7, v11)
+ v14 := v7.Permute2(v10, v13)
+ return v12.AsUint64x8(), v14.AsUint64x8()
+}
+
+var expandAVX512_4SIMD_mat0 = [8]uint64{
+ 0x0101010102020202, 0x0404040408080808, 0x1010101020202020, 0x4040404080808080,
+ 0x0101010102020202, 0x0404040408080808, 0x1010101020202020, 0x4040404080808080,
+}
+var expandAVX512_4SIMD_inShuf0 = [8]uint64{
+ 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+}
+var expandAVX512_4SIMD_inShuf1 = [8]uint64{
+ 0x1716151413121110, 0x1716151413121110, 0x1716151413121110, 0x1716151413121110,
+ 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918,
+}
+var expandAVX512_4SIMD_outShufLo = [8]uint64{
+ 0x1911090118100800, 0x1b130b031a120a02, 0x1d150d051c140c04, 0x1f170f071e160e06,
+ 0x3931292138302820, 0x3b332b233a322a22, 0x3d352d253c342c24, 0x3f372f273e362e26,
+}
+
+func expandAVX512_4SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_4SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_4SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_4SIMD_inShuf1).AsUint8x64()
+ v8 := simd.LoadUint64x8(&expandAVX512_4SIMD_outShufLo).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v6 := v0.Permute(v5)
+ v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v9 := v4.Permute(v8)
+ v10 := v7.Permute(v8)
+ return v9.AsUint64x8(), v10.AsUint64x8()
+}
+
+var expandAVX512_6SIMD_mat0 = [8]uint64{
+ 0x0101010101010202, 0x0202020204040404, 0x0404080808080808, 0x1010101010102020,
+ 0x2020202040404040, 0x4040808080808080, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_6SIMD_inShuf0 = [8]uint64{
+ 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+ 0x0706050403020100, 0x0706050403020100, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_6SIMD_inShuf1 = [8]uint64{
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_6SIMD_inShuf2 = [8]uint64{
+ 0xffff151413121110, 0xffff151413121110, 0xffffff1413121110, 0xffffff1413121110,
+ 0xffffff1413121110, 0xffffff1413121110, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_6SIMD_outShufLo = [8]uint64{
+ 0x0901282018100800, 0x1a120a0229211911, 0x2b231b130b032a22, 0x0d052c241c140c04,
+ 0x1e160e062d251d15, 0x2f271f170f072e26, 0x4941686058504840, 0x5a524a4269615951,
+}
+var expandAVX512_6SIMD_outShufHi = [8]uint64{
+ 0x2b231b130b032a22, 0x0d052c241c140c04, 0x1e160e062d251d15, 0x2f271f170f072e26,
+ 0x4941686058504840, 0x5a524a4269615951, 0x6b635b534b436a62, 0x4d456c645c544c44,
+}
+
+func expandAVX512_6SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_6SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_6SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_6SIMD_inShuf1).AsUint8x64()
+ v8 := simd.LoadUint64x8(&expandAVX512_6SIMD_inShuf2).AsUint8x64()
+ v11 := simd.LoadUint64x8(&expandAVX512_6SIMD_outShufLo).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_6SIMD_outShufHi).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v6 := v0.Permute(v5)
+ v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v9 := v0.Permute(v8)
+ v10 := v9.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v12 := v4.Permute2(v7, v11)
+ v14 := v7.Permute2(v10, v13)
+ return v12.AsUint64x8(), v14.AsUint64x8()
+}
+
+var expandAVX512_8SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_8SIMD_inShuf0 = [8]uint64{
+ 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+ 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+}
+var expandAVX512_8SIMD_inShuf1 = [8]uint64{
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+}
+var expandAVX512_8SIMD_outShufLo = [8]uint64{
+ 0x3830282018100800, 0x3931292119110901, 0x3a322a221a120a02, 0x3b332b231b130b03,
+ 0x3c342c241c140c04, 0x3d352d251d150d05, 0x3e362e261e160e06, 0x3f372f271f170f07,
+}
+
+func expandAVX512_8SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_8SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_8SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_8SIMD_inShuf1).AsUint8x64()
+ v8 := simd.LoadUint64x8(&expandAVX512_8SIMD_outShufLo).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v6 := v0.Permute(v5)
+ v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v9 := v4.Permute(v8)
+ v10 := v7.Permute(v8)
+ return v9.AsUint64x8(), v10.AsUint64x8()
+}
+
+var expandAVX512_10SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101020202020202, 0x0202020204040404, 0x0404040404040808,
+ 0x0808080808080808, 0x1010101010101010, 0x1010202020202020, 0x2020202040404040,
+}
+var expandAVX512_10SIMD_inShuf0 = [8]uint64{
+ 0xff06050403020100, 0xff06050403020100, 0xff06050403020100, 0xff06050403020100,
+ 0xffff050403020100, 0xffff050403020100, 0xffff050403020100, 0xffff050403020100,
+}
+var expandAVX512_10SIMD_mat1 = [8]uint64{
+ 0x4040404040408080, 0x8080808080808080, 0x0808080808080808, 0x1010101010101010,
+ 0x1010202020202020, 0x2020202040404040, 0x4040404040408080, 0x8080808080808080,
+}
+var expandAVX512_10SIMD_inShuf1 = [8]uint64{
+ 0xffff050403020100, 0xffff050403020100, 0xff0c0b0a09080706, 0xff0c0b0a09080706,
+ 0xff0c0b0a09080706, 0xff0c0b0a09080706, 0xffff0b0a09080706, 0xffff0b0a09080706,
+}
+var expandAVX512_10SIMD_mat2 = [8]uint64{
+ 0x0101010101010101, 0x0101020202020202, 0x0202020204040404, 0x0404040404040808,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_10SIMD_inShuf2 = [8]uint64{
+ 0xffff0c0b0a090807, 0xffff0c0b0a090807, 0xffff0c0b0a090807, 0xffff0c0b0a090807,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_10SIMD_outShufLo = [8]uint64{
+ 0x3830282018100800, 0x2921191109014840, 0x1a120a0249413931, 0x0b034a423a322a22,
+ 0x4b433b332b231b13, 0x3c342c241c140c04, 0x2d251d150d054c44, 0x1e160e064d453d35,
+}
+var expandAVX512_10SIMD_outShufHi = [8]uint64{
+ 0x4840383028201810, 0x3931292119115850, 0x2a221a1259514941, 0x1b135a524a423a32,
+ 0x5b534b433b332b23, 0x4c443c342c241c14, 0x3d352d251d155c54, 0x2e261e165d554d45,
+}
+
+func expandAVX512_10SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_10SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_10SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_10SIMD_mat1).AsUint8x64()
+ v6 := simd.LoadUint64x8(&expandAVX512_10SIMD_inShuf1).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_10SIMD_mat2).AsUint8x64()
+ v10 := simd.LoadUint64x8(&expandAVX512_10SIMD_inShuf2).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_10SIMD_outShufLo).AsUint8x64()
+ v15 := simd.LoadUint64x8(&expandAVX512_10SIMD_outShufHi).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v7 := v0.Permute(v6)
+ v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+ v11 := v0.Permute(v10)
+ v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+ v14 := v4.Permute2(v8, v13)
+ v16 := v8.Permute2(v12, v15)
+ return v14.AsUint64x8(), v16.AsUint64x8()
+}
+
+var expandAVX512_12SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_12SIMD_inShuf0 = [8]uint64{
+ 0xffff050403020100, 0xffff050403020100, 0xffff050403020100, 0xffff050403020100,
+ 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+}
+var expandAVX512_12SIMD_mat1 = [8]uint64{
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_12SIMD_inShuf1 = [8]uint64{
+ 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+ 0xffff0a0908070605, 0xffff0a0908070605, 0xffff0a0908070605, 0xffff0a0908070605,
+}
+var expandAVX512_12SIMD_mat2 = [8]uint64{
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+}
+var expandAVX512_12SIMD_inShuf2 = [8]uint64{
+ 0xffffff0908070605, 0xffffff0908070605, 0xffffff0908070605, 0xffffff0908070605,
+ 0xffffff0a09080706, 0xffffff0a09080706, 0xffffff0a09080706, 0xffffff0a09080706,
+}
+var expandAVX512_12SIMD_outShufLo = [8]uint64{
+ 0x3830282018100800, 0x1911090158504840, 0x5951494139312921, 0x3a322a221a120a02,
+ 0x1b130b035a524a42, 0x5b534b433b332b23, 0x3c342c241c140c04, 0x1d150d055c544c44,
+}
+var expandAVX512_12SIMD_outShufHi = [8]uint64{
+ 0x5850484038302820, 0x3931292178706860, 0x7971696159514941, 0x5a524a423a322a22,
+ 0x3b332b237a726a62, 0x7b736b635b534b43, 0x5c544c443c342c24, 0x3d352d257c746c64,
+}
+
+func expandAVX512_12SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_12SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_12SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_12SIMD_mat1).AsUint8x64()
+ v6 := simd.LoadUint64x8(&expandAVX512_12SIMD_inShuf1).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_12SIMD_mat2).AsUint8x64()
+ v10 := simd.LoadUint64x8(&expandAVX512_12SIMD_inShuf2).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_12SIMD_outShufLo).AsUint8x64()
+ v15 := simd.LoadUint64x8(&expandAVX512_12SIMD_outShufHi).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v7 := v0.Permute(v6)
+ v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+ v11 := v0.Permute(v10)
+ v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+ v14 := v4.Permute2(v8, v13)
+ v16 := v8.Permute2(v12, v15)
+ return v14.AsUint64x8(), v16.AsUint64x8()
+}
+
+var expandAVX512_14SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+ 0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_14SIMD_inShuf0 = [8]uint64{
+ 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+ 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+}
+var expandAVX512_14SIMD_mat1 = [8]uint64{
+ 0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+ 0x4040808080808080, 0x8080808080808080, 0x1010101010102020, 0x2020202020202020,
+}
+var expandAVX512_14SIMD_inShuf1 = [8]uint64{
+ 0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100,
+ 0xffffffff03020100, 0xffffffff03020100, 0xffffff0807060504, 0xffffff0807060504,
+}
+var expandAVX512_14SIMD_mat2 = [8]uint64{
+ 0x2020202040404040, 0x4040404040404040, 0x4040808080808080, 0x8080808080808080,
+ 0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+}
+var expandAVX512_14SIMD_inShuf2 = [8]uint64{
+ 0xffffff0807060504, 0xffffff0807060504, 0xffffff0807060504, 0xffffff0807060504,
+ 0xffffff0908070605, 0xffffff0908070605, 0xffffffff08070605, 0xffffffff08070605,
+}
+var expandAVX512_14SIMD_mat3 = [8]uint64{
+ 0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_14SIMD_inShuf3 = [8]uint64{
+ 0xffffffff08070605, 0xffffffff08070605, 0xffffffff08070605, 0xffffffff08070605,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_14SIMD_outShufLo = [8]uint64{
+ 0x3830282018100800, 0x0901686058504840, 0x4941393129211911, 0x1a120a0269615951,
+ 0x5a524a423a322a22, 0x2b231b130b036a62, 0x6b635b534b433b33, 0x3c342c241c140c04,
+}
+var expandAVX512_14SIMD_outShufHi0 = [8]uint64{
+ 0x6860585048403830, 0x3931ffffffff7870, 0x7971696159514941, 0x4a423a32ffffffff,
+ 0xffff7a726a625a52, 0x5b534b433b33ffff, 0xffffffff7b736b63, 0x6c645c544c443c34,
+}
+var expandAVX512_14SIMD_outShufHi1 = [8]uint64{
+ 0xffffffffffffffff, 0xffff18100800ffff, 0xffffffffffffffff, 0xffffffff19110901,
+ 0x0a02ffffffffffff, 0xffffffffffff1a12, 0x1b130b03ffffffff, 0xffffffffffffffff,
+}
+
+func expandAVX512_14SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_14SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_14SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_14SIMD_mat1).AsUint8x64()
+ v6 := simd.LoadUint64x8(&expandAVX512_14SIMD_inShuf1).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_14SIMD_mat2).AsUint8x64()
+ v10 := simd.LoadUint64x8(&expandAVX512_14SIMD_inShuf2).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_14SIMD_mat3).AsUint8x64()
+ v14 := simd.LoadUint64x8(&expandAVX512_14SIMD_inShuf3).AsUint8x64()
+ v17 := simd.LoadUint64x8(&expandAVX512_14SIMD_outShufLo).AsUint8x64()
+ v19 := simd.LoadUint64x8(&expandAVX512_14SIMD_outShufHi0).AsUint8x64()
+ v20 := simd.LoadUint64x8(&expandAVX512_14SIMD_outShufHi1).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v7 := v0.Permute(v6)
+ v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+ v11 := v0.Permute(v10)
+ v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+ v15 := v0.Permute(v14)
+ v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+ v18 := v4.Permute2(v8, v17)
+ u0 := uint64(0xff0ffc3ff0ffc3ff)
+ m0 := simd.Mask8x64FromBits(u0)
+ v21 := v8.Permute2(v12, v19).Masked(m0)
+ u1 := uint64(0xf003c00f003c00)
+ m1 := simd.Mask8x64FromBits(u1)
+ v22 := v16.Permute(v20).Masked(m1)
+ v23 := v21.Or(v22)
+ return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_16SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_16SIMD_inShuf0 = [8]uint64{
+ 0x0303020201010000, 0x0303020201010000, 0x0303020201010000, 0x0303020201010000,
+ 0x0303020201010000, 0x0303020201010000, 0x0303020201010000, 0x0303020201010000,
+}
+var expandAVX512_16SIMD_inShuf1 = [8]uint64{
+ 0x0707060605050404, 0x0707060605050404, 0x0707060605050404, 0x0707060605050404,
+ 0x0707060605050404, 0x0707060605050404, 0x0707060605050404, 0x0707060605050404,
+}
+var expandAVX512_16SIMD_outShufLo = [8]uint64{
+ 0x1918111009080100, 0x3938313029282120, 0x1b1a13120b0a0302, 0x3b3a33322b2a2322,
+ 0x1d1c15140d0c0504, 0x3d3c35342d2c2524, 0x1f1e17160f0e0706, 0x3f3e37362f2e2726,
+}
+
+func expandAVX512_16SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_16SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_16SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_16SIMD_inShuf1).AsUint8x64()
+ v8 := simd.LoadUint64x8(&expandAVX512_16SIMD_outShufLo).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v6 := v0.Permute(v5)
+ v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v9 := v4.Permute(v8)
+ v10 := v7.Permute(v8)
+ return v9.AsUint64x8(), v10.AsUint64x8()
+}
+
+var expandAVX512_18SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101020202020202, 0x0202020202020202, 0x0202020204040404,
+ 0x0404040404040404, 0x0404040404040808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_18SIMD_inShuf0 = [8]uint64{
+ 0x0303020201010000, 0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100,
+ 0xffffffff03020100, 0xffffffff03020100, 0x0303020201010000, 0xff03020201010000,
+}
+var expandAVX512_18SIMD_mat1 = [8]uint64{
+ 0x1010202020202020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+ 0x4040404040408080, 0x8080808080808080, 0x1010101010101010, 0x1010202020202020,
+}
+var expandAVX512_18SIMD_inShuf1 = [8]uint64{
+ 0xffffffffff020100, 0xffffffffff020100, 0xffffffffff020100, 0xffffffffff020100,
+ 0xffffffffff020100, 0xffff020201010000, 0xff06060505040403, 0xffffffff06050403,
+}
+var expandAVX512_18SIMD_mat2 = [8]uint64{
+ 0x2020202020202020, 0x2020202040404040, 0x4040404040404040, 0x4040404040408080,
+ 0x8080808080808080, 0x0101010101010101, 0x0101020202020202, 0x0202020202020202,
+}
+var expandAVX512_18SIMD_inShuf2 = [8]uint64{
+ 0xffffffff06050403, 0xffffffff06050403, 0xffffffff06050403, 0xffffffff06050403,
+ 0x0606050504040303, 0x0707060605050404, 0xffffffffff060504, 0xffffffffff060504,
+}
+var expandAVX512_18SIMD_mat3 = [8]uint64{
+ 0x0202020204040404, 0x0404040404040404, 0x0404040404040808, 0x0808080808080808,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_18SIMD_inShuf3 = [8]uint64{
+ 0xffffffffff060504, 0xffffffffff060504, 0xffffffffff060504, 0xffff060605050404,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_18SIMD_outShufLo = [8]uint64{
+ 0x3028201810080100, 0x6058504840393831, 0x2119110903026968, 0x5149413b3a333229,
+ 0x120a05046b6a6159, 0x423d3c35342a221a, 0x07066d6c625a524a, 0x3e37362b231b130b,
+}
+var expandAVX512_18SIMD_outShufHi0 = [8]uint64{
+ 0x6160585048403830, 0xffffffff78706968, 0x59514941393231ff, 0xffff79716b6a6362,
+ 0x4a423a3433ffffff, 0x7a726d6c65645a52, 0x3b3635ffffffffff, 0x6f6e67665b534b43,
+}
+var expandAVX512_18SIMD_outShufHi1 = [8]uint64{
+ 0xffffffffffffffff, 0x18100800ffffffff, 0xffffffffffffff19, 0x0901ffffffffffff,
+ 0xffffffffff1b1a11, 0xffffffffffffffff, 0xffffff1d1c120a02, 0xffffffffffffffff,
+}
+
+func expandAVX512_18SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_18SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_18SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_18SIMD_mat1).AsUint8x64()
+ v6 := simd.LoadUint64x8(&expandAVX512_18SIMD_inShuf1).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_18SIMD_mat2).AsUint8x64()
+ v10 := simd.LoadUint64x8(&expandAVX512_18SIMD_inShuf2).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_18SIMD_mat3).AsUint8x64()
+ v14 := simd.LoadUint64x8(&expandAVX512_18SIMD_inShuf3).AsUint8x64()
+ v17 := simd.LoadUint64x8(&expandAVX512_18SIMD_outShufLo).AsUint8x64()
+ v19 := simd.LoadUint64x8(&expandAVX512_18SIMD_outShufHi0).AsUint8x64()
+ v20 := simd.LoadUint64x8(&expandAVX512_18SIMD_outShufHi1).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v7 := v0.Permute(v6)
+ v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+ v11 := v0.Permute(v10)
+ v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+ v15 := v0.Permute(v14)
+ v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+ v18 := v4.Permute2(v8, v17)
+ u0 := uint64(0xffe0fff83ffe0fff)
+ m0 := simd.Mask8x64FromBits(u0)
+ v21 := v8.Permute2(v12, v19).Masked(m0)
+ u1 := uint64(0x1f0007c001f000)
+ m1 := simd.Mask8x64FromBits(u1)
+ v22 := v16.Permute(v20).Masked(m1)
+ v23 := v21.Or(v22)
+ return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_20SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_20SIMD_inShuf0 = [8]uint64{
+ 0x0303020201010000, 0xffffffff03020100, 0xff03020201010000, 0xffff020201010000,
+ 0xffffffffff020100, 0xffff020201010000, 0xffff020201010000, 0xffffffffff020100,
+}
+var expandAVX512_20SIMD_mat1 = [8]uint64{
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0202020202020202, 0x0404040404040404, 0x0404040408080808, 0x0808080808080808,
+}
+var expandAVX512_20SIMD_inShuf1 = [8]uint64{
+ 0xffff020201010000, 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000,
+ 0xff06060505040403, 0x0606050504040303, 0xffffffff06050403, 0xffff050504040303,
+}
+var expandAVX512_20SIMD_mat2 = [8]uint64{
+ 0x1010101010101010, 0x1010101020202020, 0x2020202020202020, 0x4040404040404040,
+ 0x4040404080808080, 0x8080808080808080, 0x0101010101010101, 0x0101010102020202,
+}
+var expandAVX512_20SIMD_inShuf2 = [8]uint64{
+ 0xffff050504040303, 0xffffffffff050403, 0xffff050504040303, 0xffff050504040303,
+ 0xffffffffff050403, 0xffff050504040303, 0xffff060605050404, 0xffffffffff060504,
+}
+var expandAVX512_20SIMD_outShufLo = [8]uint64{
+ 0x2019181110080100, 0x4841403831302928, 0x1209030259585049, 0x33322b2a211b1a13,
+ 0x5b5a514b4a434239, 0x221d1c15140a0504, 0x4c45443a35342d2c, 0x160b07065d5c524d,
+}
+var expandAVX512_20SIMD_outShufHi = [8]uint64{
+ 0x4140393830292820, 0x6968605958515048, 0x312b2a2221787170, 0x5a53524943423b3a,
+ 0x237973726b6a615b, 0x45443d3c322d2c24, 0x6d6c625d5c55544a, 0x332f2e26257a7574,
+}
+
+func expandAVX512_20SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_20SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_20SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_20SIMD_mat1).AsUint8x64()
+ v6 := simd.LoadUint64x8(&expandAVX512_20SIMD_inShuf1).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_20SIMD_mat2).AsUint8x64()
+ v10 := simd.LoadUint64x8(&expandAVX512_20SIMD_inShuf2).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_20SIMD_outShufLo).AsUint8x64()
+ v15 := simd.LoadUint64x8(&expandAVX512_20SIMD_outShufHi).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v7 := v0.Permute(v6)
+ v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+ v11 := v0.Permute(v10)
+ v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+ v14 := v4.Permute2(v8, v13)
+ v16 := v8.Permute2(v12, v15)
+ return v14.AsUint64x8(), v16.AsUint64x8()
+}
+
+// Tables for expandAVX512_22SIMD. Machine-generated data (see the generator
+// under internal/runtime/gc/internal/gen): the matN arrays are 8x8 bit
+// matrices consumed by GaloisFieldAffineTransform, and the inShufN/outShuf*
+// arrays are byte-permutation index vectors. NOTE(review): 0xff index bytes
+// appear to mark don't-care lanes whose results are discarded by a later
+// merge mask — confirm against the generator.
+var expandAVX512_22SIMD_mat0 = [8]uint64{
+	0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+	0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_22SIMD_inShuf0 = [8]uint64{
+	0xffff020201010000, 0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100,
+	0xffff020201010000, 0xffffffffff020100, 0xffff020201010000, 0xffff020201010000,
+}
+var expandAVX512_22SIMD_mat1 = [8]uint64{
+	0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+	0x4040808080808080, 0x8080808080808080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_22SIMD_inShuf1 = [8]uint64{
+	0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000,
+	0xffffffffff020100, 0xffffffff01010000, 0xffff040403030202, 0xffff050504040303,
+}
+var expandAVX512_22SIMD_mat2 = [8]uint64{
+	0x0101010101010202, 0x0202020202020202, 0x0202020204040404, 0x0404040404040404,
+	0x0404080808080808, 0x0808080808080808, 0x1010101010101010, 0x1010101010102020,
+}
+var expandAVX512_22SIMD_inShuf2 = [8]uint64{
+	0xffffffffff050403, 0xffff050504040303, 0xffffffffff050403, 0xffff050504040303,
+	0xffffffffff050403, 0xffff050504040303, 0xffff050504040303, 0xffffffffff050403,
+}
+var expandAVX512_22SIMD_mat3 = [8]uint64{
+	0x2020202020202020, 0x2020202040404040, 0x4040404040404040, 0x4040808080808080,
+	0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_22SIMD_inShuf3 = [8]uint64{
+	0xffff050504040303, 0xffffffffff050403, 0xffffff0504040303, 0xffffffffffff0403,
+	0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_22SIMD_outShufLo = [8]uint64{
+	0x2120181110080100, 0x4948403938313028, 0x0302696860595850, 0x3229232219131209,
+	0x5a514b4a413b3a33, 0x140a05046b6a615b, 0x3c35342a25241a15, 0x625d5c524d4c423d,
+}
+var expandAVX512_22SIMD_outShufHi0 = [8]uint64{
+	0x5049484039383130, 0x7871706968605958, 0x3332ffffffffffff, 0x5b5a514b4a413b3a,
+	0xffff7973726b6a61, 0x3d3c3534ffffffff, 0x6c625d5c524d4c42, 0xffffffff7a75746d,
+}
+var expandAVX512_22SIMD_outShufHi1 = [8]uint64{
+	0xffffffffffffffff, 0xffffffffffffffff, 0xffff181110080100, 0xffffffffffffffff,
+	0x0302ffffffffffff, 0xffffffff19131209, 0xffffffffffffffff, 0x140a0504ffffffff,
+}
+
+// expandAVX512_22SIMD expands each bit of the 512-bit mask at src into f
+// consecutive bits of the result (f is presumably 22, this kernel's size
+// class word size — confirm against the generator), returning the low and
+// high 512-bit halves as two Uint64x8 vectors.
+func expandAVX512_22SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	v1 := simd.LoadUint64x8(&expandAVX512_22SIMD_mat0).AsUint8x64()
+	v2 := simd.LoadUint64x8(&expandAVX512_22SIMD_inShuf0).AsUint8x64()
+	v5 := simd.LoadUint64x8(&expandAVX512_22SIMD_mat1).AsUint8x64()
+	v6 := simd.LoadUint64x8(&expandAVX512_22SIMD_inShuf1).AsUint8x64()
+	v9 := simd.LoadUint64x8(&expandAVX512_22SIMD_mat2).AsUint8x64()
+	v10 := simd.LoadUint64x8(&expandAVX512_22SIMD_inShuf2).AsUint8x64()
+	v13 := simd.LoadUint64x8(&expandAVX512_22SIMD_mat3).AsUint8x64()
+	v14 := simd.LoadUint64x8(&expandAVX512_22SIMD_inShuf3).AsUint8x64()
+	v17 := simd.LoadUint64x8(&expandAVX512_22SIMD_outShufLo).AsUint8x64()
+	v19 := simd.LoadUint64x8(&expandAVX512_22SIMD_outShufHi0).AsUint8x64()
+	v20 := simd.LoadUint64x8(&expandAVX512_22SIMD_outShufHi1).AsUint8x64()
+	// Stage 1: gather source bytes (Permute with inShufN), then spread the
+	// bits within each byte via a GF(2) 8x8 affine transform against matN.
+	v3 := v0.Permute(v2)
+	v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v7 := v0.Permute(v6)
+	v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+	v11 := v0.Permute(v10)
+	v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+	v15 := v0.Permute(v14)
+	v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+	// Stage 2: assemble output byte order. The high half is built from two
+	// masked partial permutations OR'd together; the masks select disjoint
+	// lanes.
+	v18 := v4.Permute2(v8, v17)
+	u0 := uint64(0xffff03fffc0ffff)
+	m0 := simd.Mask8x64FromBits(u0)
+	v21 := v8.Permute2(v12, v19).Masked(m0)
+	u1 := uint64(0xf0000fc0003f0000)
+	m1 := simd.Mask8x64FromBits(u1)
+	v22 := v16.Permute(v20).Masked(m1)
+	v23 := v21.Or(v22)
+	return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+// Tables for expandAVX512_24SIMD. Machine-generated data (see the generator
+// under internal/runtime/gc/internal/gen): matN are 8x8 bit matrices for
+// GaloisFieldAffineTransform; inShufN/outShuf* are byte-permutation indices.
+// There is no mat1 — the kernel reuses mat0 for the second transform.
+// NOTE(review): 0xff index bytes appear to mark lanes later discarded by a
+// merge mask — confirm against the generator.
+var expandAVX512_24SIMD_mat0 = [8]uint64{
+	0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+	0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_24SIMD_inShuf0 = [8]uint64{
+	0x0202010101000000, 0x0202010101000000, 0x0202010101000000, 0x0202010101000000,
+	0x0202010101000000, 0xff02010101000000, 0xffff010101000000, 0xffff010101000000,
+}
+var expandAVX512_24SIMD_inShuf1 = [8]uint64{
+	0xffffffffffffff02, 0xffffffffffffff02, 0xffffffffffffff02, 0xffffffffffffff02,
+	0xffffffffffffff02, 0x0404040303030202, 0x0404030303020202, 0x0404030303020202,
+}
+var expandAVX512_24SIMD_mat2 = [8]uint64{
+	0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+	0x1010101010101010, 0x4040404040404040, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_24SIMD_inShuf2 = [8]uint64{
+	0x0505040404030303, 0x0505040404030303, 0x0505040404030303, 0xffff040404030303,
+	0xffff040404030303, 0xffffffffffffff04, 0xffffffffffffff04, 0xffffffffffffff05,
+}
+var expandAVX512_24SIMD_mat3 = [8]uint64{
+	0x0202020202020202, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+	0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_24SIMD_inShuf3 = [8]uint64{
+	0xffffffffffffff05, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+	0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_24SIMD_outShufLo = [8]uint64{
+	0x11100a0908020100, 0x282221201a191812, 0x3a39383231302a29, 0x14130d0c0b050403,
+	0x2b2524231d1c1b15, 0x3d3c3b3534332d2c, 0x1716480f0e400706, 0x2e602726581f1e50,
+}
+var expandAVX512_24SIMD_outShufHi0 = [8]uint64{
+	0x3a39383231302928, 0x51504a4948424140, 0x2a6261605a595852, 0x3d3c3b3534332c2b,
+	0x54534d4c4b454443, 0x2d6564635d5c5b55, 0x703f3e6837362f2e, 0x5756ff4f4e784746,
+}
+var expandAVX512_24SIMD_outShufHi1 = [8]uint64{
+	0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+	0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff00ffffffffff,
+}
+
+// expandAVX512_24SIMD expands each bit of the 512-bit mask at src into f
+// consecutive bits of the result (f is presumably 24, this kernel's size
+// class word size — confirm against the generator), returning the low and
+// high 512-bit halves as two Uint64x8 vectors.
+func expandAVX512_24SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	v1 := simd.LoadUint64x8(&expandAVX512_24SIMD_mat0).AsUint8x64()
+	v2 := simd.LoadUint64x8(&expandAVX512_24SIMD_inShuf0).AsUint8x64()
+	v5 := simd.LoadUint64x8(&expandAVX512_24SIMD_inShuf1).AsUint8x64()
+	v8 := simd.LoadUint64x8(&expandAVX512_24SIMD_mat2).AsUint8x64()
+	v9 := simd.LoadUint64x8(&expandAVX512_24SIMD_inShuf2).AsUint8x64()
+	v12 := simd.LoadUint64x8(&expandAVX512_24SIMD_mat3).AsUint8x64()
+	v13 := simd.LoadUint64x8(&expandAVX512_24SIMD_inShuf3).AsUint8x64()
+	v16 := simd.LoadUint64x8(&expandAVX512_24SIMD_outShufLo).AsUint8x64()
+	v18 := simd.LoadUint64x8(&expandAVX512_24SIMD_outShufHi0).AsUint8x64()
+	v19 := simd.LoadUint64x8(&expandAVX512_24SIMD_outShufHi1).AsUint8x64()
+	// Stage 1: gather source bytes, then spread bits within each byte via a
+	// GF(2) 8x8 affine transform. Note v4 and v7 both use mat0 (v1).
+	v3 := v0.Permute(v2)
+	v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v6 := v0.Permute(v5)
+	v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v10 := v0.Permute(v9)
+	v11 := v10.GaloisFieldAffineTransform(v8.AsUint64x8(), 0)
+	v14 := v0.Permute(v13)
+	v15 := v14.GaloisFieldAffineTransform(v12.AsUint64x8(), 0)
+	// Stage 2: assemble output byte order; the high half merges two masked
+	// partial permutations over disjoint lanes.
+	v17 := v4.Permute2(v7, v16)
+	u0 := uint64(0xdfffffffffffffff)
+	m0 := simd.Mask8x64FromBits(u0)
+	v20 := v7.Permute2(v11, v18).Masked(m0)
+	u1 := uint64(0x2000000000000000)
+	m1 := simd.Mask8x64FromBits(u1)
+	v21 := v15.Permute(v19).Masked(m1)
+	v22 := v20.Or(v21)
+	return v17.AsUint64x8(), v22.AsUint64x8()
+}
+
+// Tables for expandAVX512_26SIMD. Machine-generated data (see the generator
+// under internal/runtime/gc/internal/gen): matN are 8x8 bit matrices for
+// GaloisFieldAffineTransform; inShufN/outShuf* are byte-permutation indices.
+// NOTE(review): 0xff index bytes appear to mark lanes later discarded by a
+// merge mask — confirm against the generator.
+var expandAVX512_26SIMD_mat0 = [8]uint64{
+	0x0101010101010101, 0x0101020202020202, 0x0202020202020202, 0x0202020204040404,
+	0x0404040404040404, 0x0404040404040808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_26SIMD_inShuf0 = [8]uint64{
+	0x0202010101000000, 0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100,
+	0xffff020201010000, 0xffffffffff020100, 0x0202010101000000, 0xffff010101000000,
+}
+var expandAVX512_26SIMD_mat1 = [8]uint64{
+	0x1010202020202020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+	0x4040404040408080, 0x8080808080808080, 0x0101010101010101, 0x0808080808080808,
+}
+var expandAVX512_26SIMD_inShuf1 = [8]uint64{
+	0xffffffffffff0100, 0xffffffff01010000, 0xffffffffffff0100, 0xffffffff01010000,
+	0xffffffffffff0100, 0xffff010101000000, 0xffffffffffffff02, 0xff04040403030302,
+}
+var expandAVX512_26SIMD_mat2 = [8]uint64{
+	0x1010101010101010, 0x1010202020202020, 0x2020202020202020, 0x2020202040404040,
+	0x4040404040404040, 0x4040404040408080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_26SIMD_inShuf2 = [8]uint64{
+	0x0404030303020202, 0xffffffffff040302, 0xffff040403030202, 0xffffffffff040302,
+	0xffff040403030202, 0xffffffffff040302, 0xff04030303020202, 0xffff040404030303,
+}
+var expandAVX512_26SIMD_mat3 = [8]uint64{
+	0x0101020202020202, 0x0202020202020202, 0x0202020204040404, 0x0404040404040404,
+	0x0404040404040808, 0x1010101010101010, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_26SIMD_inShuf3 = [8]uint64{
+	0xffffffffffff0403, 0xffffffff04040303, 0xffffffffffff0403, 0xffffffff04040303,
+	0xffffffffffff0403, 0xffffffffffffff04, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_26SIMD_outShufLo = [8]uint64{
+	0x2018111008020100, 0x3a39383231302821, 0x6860595850494840, 0x1312090504036a69,
+	0x3b35343329232219, 0x5b5a514b4a413d3c, 0x0a7007066d6c6b61, 0x37362a25241a1514,
+}
+var expandAVX512_26SIMD_outShufHi0 = [8]uint64{
+	0x5851504842414038, 0x7978727170686160, 0xffffffffffffff7a, 0x52494544433b3a39,
+	0x7574736963625953, 0xffffffffff7d7c7b, 0xff47463e3d3cffff, 0x766a65645a55544a,
+}
+var expandAVX512_26SIMD_outShufHi1 = [8]uint64{
+	0xffffffffffffffff, 0xffffffffffffffff, 0x20191810090800ff, 0xffffffffffffffff,
+	0xffffffffffffffff, 0x1a110b0a01ffffff, 0x28ffffffffff211b, 0xffffffffffffffff,
+}
+
+// expandAVX512_26SIMD expands each bit of the 512-bit mask at src into f
+// consecutive bits of the result (f is presumably 26, this kernel's size
+// class word size — confirm against the generator), returning the low and
+// high 512-bit halves as two Uint64x8 vectors.
+func expandAVX512_26SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	v1 := simd.LoadUint64x8(&expandAVX512_26SIMD_mat0).AsUint8x64()
+	v2 := simd.LoadUint64x8(&expandAVX512_26SIMD_inShuf0).AsUint8x64()
+	v5 := simd.LoadUint64x8(&expandAVX512_26SIMD_mat1).AsUint8x64()
+	v6 := simd.LoadUint64x8(&expandAVX512_26SIMD_inShuf1).AsUint8x64()
+	v9 := simd.LoadUint64x8(&expandAVX512_26SIMD_mat2).AsUint8x64()
+	v10 := simd.LoadUint64x8(&expandAVX512_26SIMD_inShuf2).AsUint8x64()
+	v13 := simd.LoadUint64x8(&expandAVX512_26SIMD_mat3).AsUint8x64()
+	v14 := simd.LoadUint64x8(&expandAVX512_26SIMD_inShuf3).AsUint8x64()
+	v17 := simd.LoadUint64x8(&expandAVX512_26SIMD_outShufLo).AsUint8x64()
+	v19 := simd.LoadUint64x8(&expandAVX512_26SIMD_outShufHi0).AsUint8x64()
+	v20 := simd.LoadUint64x8(&expandAVX512_26SIMD_outShufHi1).AsUint8x64()
+	// Stage 1: gather source bytes (Permute with inShufN), then spread the
+	// bits within each byte via a GF(2) 8x8 affine transform against matN.
+	v3 := v0.Permute(v2)
+	v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v7 := v0.Permute(v6)
+	v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+	v11 := v0.Permute(v10)
+	v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+	v15 := v0.Permute(v14)
+	v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+	// Stage 2: assemble output byte order; the high half merges two masked
+	// partial permutations over disjoint lanes.
+	v18 := v4.Permute2(v8, v17)
+	u0 := uint64(0xff7c07ffff01ffff)
+	m0 := simd.Mask8x64FromBits(u0)
+	v21 := v8.Permute2(v12, v19).Masked(m0)
+	u1 := uint64(0x83f80000fe0000)
+	m1 := simd.Mask8x64FromBits(u1)
+	v22 := v16.Permute(v20).Masked(m1)
+	v23 := v21.Or(v22)
+	return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+// Tables for expandAVX512_28SIMD. Machine-generated data (see the generator
+// under internal/runtime/gc/internal/gen): matN are 8x8 bit matrices for
+// GaloisFieldAffineTransform; inShufN/outShuf* are byte-permutation indices.
+// NOTE(review): 0xff index bytes appear to mark lanes later discarded by a
+// merge mask — confirm against the generator.
+var expandAVX512_28SIMD_mat0 = [8]uint64{
+	0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+	0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_28SIMD_inShuf0 = [8]uint64{
+	0x0202010101000000, 0xffffffffff020100, 0x0202010101000000, 0xff02010101000000,
+	0xffffffffffff0100, 0xffff010101000000, 0xffff010101000000, 0xffffffffffff0100,
+}
+var expandAVX512_28SIMD_mat1 = [8]uint64{
+	0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+	0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0404040408080808,
+}
+var expandAVX512_28SIMD_inShuf1 = [8]uint64{
+	0xffff010101000000, 0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000,
+	0xffffffffffffff02, 0xffffffffffffff02, 0x0404040303030202, 0xffffffffff040302,
+}
+var expandAVX512_28SIMD_mat2 = [8]uint64{
+	0x0808080808080808, 0x1010101010101010, 0x1010101020202020, 0x2020202020202020,
+	0x4040404040404040, 0x4040404080808080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_28SIMD_inShuf2 = [8]uint64{
+	0x0404030303020202, 0x0404030303020202, 0xffffffffffff0302, 0xffff030303020202,
+	0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffff040404030303,
+}
+var expandAVX512_28SIMD_mat3 = [8]uint64{
+	0x0101010102020202, 0x0202020202020202, 0x0808080808080808, 0x0000000000000000,
+	0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_28SIMD_inShuf3 = [8]uint64{
+	0xffffffffffff0403, 0xffff040404030303, 0xffffffffffffff04, 0xffffffffffffffff,
+	0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_28SIMD_outShufLo = [8]uint64{
+	0x1812111008020100, 0x31302a2928201a19, 0x4a49484241403832, 0x090504035a595850,
+	0x2b211d1c1b151413, 0x4443393534332d2c, 0x5d5c5b514d4c4b45, 0x1e6817160a600706,
+}
+var expandAVX512_28SIMD_outShufHi0 = [8]uint64{
+	0x4948424140383130, 0x6261605a5958504a, 0xff7a797872717068, 0x4339343332ffffff,
+	0x5c5b514d4c4b4544, 0x757473696564635d, 0x35ffffffff7d7c7b, 0x4f4eff47463a3736,
+}
+var expandAVX512_28SIMD_outShufHi1 = [8]uint64{
+	0xffffffffffffffff, 0xffffffffffffffff, 0x00ffffffffffffff, 0xffffffffff0a0908,
+	0xffffffffffffffff, 0xffffffffffffffff, 0xff0d0c0b01ffffff, 0xffff10ffffffffff,
+}
+
+// expandAVX512_28SIMD expands each bit of the 512-bit mask at src into f
+// consecutive bits of the result (f is presumably 28, this kernel's size
+// class word size — confirm against the generator), returning the low and
+// high 512-bit halves as two Uint64x8 vectors.
+func expandAVX512_28SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	v1 := simd.LoadUint64x8(&expandAVX512_28SIMD_mat0).AsUint8x64()
+	v2 := simd.LoadUint64x8(&expandAVX512_28SIMD_inShuf0).AsUint8x64()
+	v5 := simd.LoadUint64x8(&expandAVX512_28SIMD_mat1).AsUint8x64()
+	v6 := simd.LoadUint64x8(&expandAVX512_28SIMD_inShuf1).AsUint8x64()
+	v9 := simd.LoadUint64x8(&expandAVX512_28SIMD_mat2).AsUint8x64()
+	v10 := simd.LoadUint64x8(&expandAVX512_28SIMD_inShuf2).AsUint8x64()
+	v13 := simd.LoadUint64x8(&expandAVX512_28SIMD_mat3).AsUint8x64()
+	v14 := simd.LoadUint64x8(&expandAVX512_28SIMD_inShuf3).AsUint8x64()
+	v17 := simd.LoadUint64x8(&expandAVX512_28SIMD_outShufLo).AsUint8x64()
+	v19 := simd.LoadUint64x8(&expandAVX512_28SIMD_outShufHi0).AsUint8x64()
+	v20 := simd.LoadUint64x8(&expandAVX512_28SIMD_outShufHi1).AsUint8x64()
+	// Stage 1: gather source bytes (Permute with inShufN), then spread the
+	// bits within each byte via a GF(2) 8x8 affine transform against matN.
+	v3 := v0.Permute(v2)
+	v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v7 := v0.Permute(v6)
+	v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+	v11 := v0.Permute(v10)
+	v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+	v15 := v0.Permute(v14)
+	v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+	// Stage 2: assemble output byte order; the high half merges two masked
+	// partial permutations over disjoint lanes.
+	v18 := v4.Permute2(v8, v17)
+	u0 := uint64(0xdf87fffff87fffff)
+	m0 := simd.Mask8x64FromBits(u0)
+	v21 := v8.Permute2(v12, v19).Masked(m0)
+	u1 := uint64(0x2078000007800000)
+	m1 := simd.Mask8x64FromBits(u1)
+	v22 := v16.Permute(v20).Masked(m1)
+	v23 := v21.Or(v22)
+	return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+// Tables for expandAVX512_30SIMD. Machine-generated data (see the generator
+// under internal/runtime/gc/internal/gen): matN are 8x8 bit matrices for
+// GaloisFieldAffineTransform; inShufN/outShuf* are byte-permutation indices.
+// NOTE(review): 0xff index bytes appear to mark lanes later discarded by a
+// merge mask — confirm against the generator.
+var expandAVX512_30SIMD_mat0 = [8]uint64{
+	0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+	0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_30SIMD_inShuf0 = [8]uint64{
+	0x0202010101000000, 0xffffffffff020100, 0xffff010101000000, 0xffffffffffff0100,
+	0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000, 0xffff010101000000,
+}
+var expandAVX512_30SIMD_mat1 = [8]uint64{
+	0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+	0x4040808080808080, 0x8080808080808080, 0x0101010101010101, 0x0202020202020202,
+}
+var expandAVX512_30SIMD_inShuf1 = [8]uint64{
+	0xffffffffffff0100, 0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000,
+	0xffffffffffff0100, 0xffff010101000000, 0xffffffffffffff02, 0x0404030303020202,
+}
+var expandAVX512_30SIMD_mat2 = [8]uint64{
+	0x0202020204040404, 0x0404040404040404, 0x0404080808080808, 0x0808080808080808,
+	0x1010101010101010, 0x1010101010102020, 0x2020202020202020, 0x2020202040404040,
+}
+var expandAVX512_30SIMD_inShuf2 = [8]uint64{
+	0xffffffffff040302, 0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202,
+	0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffffffffffff0302,
+}
+var expandAVX512_30SIMD_mat3 = [8]uint64{
+	0x4040404040404040, 0x4040808080808080, 0x8080808080808080, 0x0101010101010101,
+	0x0101010101010202, 0x0202020202020202, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_30SIMD_inShuf3 = [8]uint64{
+	0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffff040404030303,
+	0xffffffffffff0403, 0xffffffffffffff04, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_30SIMD_outShufLo = [8]uint64{
+	0x1812111008020100, 0x3832313028222120, 0x58504a4948403a39, 0x04036a6968605a59,
+	0x2423191514130905, 0x3d3c3b3534332925, 0x5d5c5b514d4c4b41, 0x0a7007066d6c6b61,
+}
+var expandAVX512_30SIMD_outShufHi0 = [8]uint64{
+	0x504a4948403a3938, 0x70686261605a5958, 0xffffffffff787271, 0x3c3bffffffffffff,
+	0x5c5b514d4c4b413d, 0x757473696564635d, 0xffffffffffffff79, 0x42ff3f3effffffff,
+}
+var expandAVX512_30SIMD_outShufHi1 = [8]uint64{
+	0xffffffffffffffff, 0xffffffffffffffff, 0x1008020100ffffff, 0xffff201a19181211,
+	0xffffffffffffffff, 0xffffffffffffffff, 0x15141309050403ff, 0xff28ffff211d1c1b,
+}
+
+// expandAVX512_30SIMD expands each bit of the 512-bit mask at src into f
+// consecutive bits of the result (f is presumably 30, this kernel's size
+// class word size — confirm against the generator), returning the low and
+// high 512-bit halves as two Uint64x8 vectors.
+func expandAVX512_30SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	v1 := simd.LoadUint64x8(&expandAVX512_30SIMD_mat0).AsUint8x64()
+	v2 := simd.LoadUint64x8(&expandAVX512_30SIMD_inShuf0).AsUint8x64()
+	v5 := simd.LoadUint64x8(&expandAVX512_30SIMD_mat1).AsUint8x64()
+	v6 := simd.LoadUint64x8(&expandAVX512_30SIMD_inShuf1).AsUint8x64()
+	v9 := simd.LoadUint64x8(&expandAVX512_30SIMD_mat2).AsUint8x64()
+	v10 := simd.LoadUint64x8(&expandAVX512_30SIMD_inShuf2).AsUint8x64()
+	v13 := simd.LoadUint64x8(&expandAVX512_30SIMD_mat3).AsUint8x64()
+	v14 := simd.LoadUint64x8(&expandAVX512_30SIMD_inShuf3).AsUint8x64()
+	v17 := simd.LoadUint64x8(&expandAVX512_30SIMD_outShufLo).AsUint8x64()
+	v19 := simd.LoadUint64x8(&expandAVX512_30SIMD_outShufHi0).AsUint8x64()
+	v20 := simd.LoadUint64x8(&expandAVX512_30SIMD_outShufHi1).AsUint8x64()
+	// Stage 1: gather source bytes (Permute with inShufN), then spread the
+	// bits within each byte via a GF(2) 8x8 affine transform against matN.
+	v3 := v0.Permute(v2)
+	v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v7 := v0.Permute(v6)
+	v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+	v11 := v0.Permute(v10)
+	v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+	v15 := v0.Permute(v14)
+	v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+	// Stage 2: assemble output byte order; the high half merges two masked
+	// partial permutations over disjoint lanes.
+	v18 := v4.Permute2(v8, v17)
+	u0 := uint64(0xb001ffffc007ffff)
+	m0 := simd.Mask8x64FromBits(u0)
+	v21 := v8.Permute2(v12, v19).Masked(m0)
+	u1 := uint64(0x4ffe00003ff80000)
+	m1 := simd.Mask8x64FromBits(u1)
+	v22 := v16.Permute(v20).Masked(m1)
+	v23 := v21.Or(v22)
+	return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+// Tables for expandAVX512_32SIMD. Machine-generated data (see the generator
+// under internal/runtime/gc/internal/gen). This is the simplest kernel: one
+// bit matrix, two input shuffles, and a single output shuffle applied to
+// both halves — no merge masks are needed.
+var expandAVX512_32SIMD_mat0 = [8]uint64{
+	0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+	0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_32SIMD_inShuf0 = [8]uint64{
+	0x0101010100000000, 0x0101010100000000, 0x0101010100000000, 0x0101010100000000,
+	0x0101010100000000, 0x0101010100000000, 0x0101010100000000, 0x0101010100000000,
+}
+var expandAVX512_32SIMD_inShuf1 = [8]uint64{
+	0x0303030302020202, 0x0303030302020202, 0x0303030302020202, 0x0303030302020202,
+	0x0303030302020202, 0x0303030302020202, 0x0303030302020202, 0x0303030302020202,
+}
+var expandAVX512_32SIMD_outShufLo = [8]uint64{
+	0x0b0a090803020100, 0x1b1a191813121110, 0x2b2a292823222120, 0x3b3a393833323130,
+	0x0f0e0d0c07060504, 0x1f1e1d1c17161514, 0x2f2e2d2c27262524, 0x3f3e3d3c37363534,
+}
+
+// expandAVX512_32SIMD expands each bit of the 512-bit mask at src into f
+// consecutive bits of the result (f is presumably 32, this kernel's size
+// class word size — confirm against the generator), returning the low and
+// high 512-bit halves as two Uint64x8 vectors.
+func expandAVX512_32SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	v1 := simd.LoadUint64x8(&expandAVX512_32SIMD_mat0).AsUint8x64()
+	v2 := simd.LoadUint64x8(&expandAVX512_32SIMD_inShuf0).AsUint8x64()
+	v5 := simd.LoadUint64x8(&expandAVX512_32SIMD_inShuf1).AsUint8x64()
+	v8 := simd.LoadUint64x8(&expandAVX512_32SIMD_outShufLo).AsUint8x64()
+	// Gather source bytes, spread bits via the GF(2) affine transform (both
+	// halves reuse mat0), then apply the same output shuffle to each half.
+	v3 := v0.Permute(v2)
+	v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v6 := v0.Permute(v5)
+	v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v9 := v4.Permute(v8)
+	v10 := v7.Permute(v8)
+	return v9.AsUint64x8(), v10.AsUint64x8()
+}
+
+// Tables for expandAVX512_36SIMD. Machine-generated data (see the generator
+// under internal/runtime/gc/internal/gen): matN are 8x8 bit matrices for
+// GaloisFieldAffineTransform; inShufN/outShuf* are byte-permutation indices.
+// This kernel needs only three transform stages and no merge masks.
+var expandAVX512_36SIMD_mat0 = [8]uint64{
+	0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+	0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_36SIMD_inShuf0 = [8]uint64{
+	0x0101010100000000, 0xffffffffffff0100, 0x0101010100000000, 0x0101010100000000,
+	0xffffffffffff0100, 0x0101010100000000, 0x0101010100000000, 0xffffffffffff0100,
+}
+var expandAVX512_36SIMD_mat1 = [8]uint64{
+	0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+	0x4040404040404040, 0x4040404080808080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_36SIMD_inShuf1 = [8]uint64{
+	0x0101010100000000, 0xffffff0100000000, 0xffffffffffffff00, 0xffffffff00000000,
+	0xff02020202010101, 0xffffffffffff0201, 0x0202020201010101, 0x0303030302020202,
+}
+var expandAVX512_36SIMD_mat2 = [8]uint64{
+	0x0101010102020202, 0x0202020202020202, 0x0404040404040404, 0x0404040408080808,
+	0x0808080808080808, 0x1010101010101010, 0x1010101020202020, 0x2020202020202020,
+}
+var expandAVX512_36SIMD_inShuf2 = [8]uint64{
+	0xffffffffffff0302, 0x0303030302020202, 0x0303030302020202, 0xffffffffffff0302,
+	0x0303030302020202, 0xffff030302020202, 0xffffffffffffff02, 0xffffffff02020202,
+}
+var expandAVX512_36SIMD_outShufLo = [8]uint64{
+	0x1211100803020100, 0x2928201b1a191813, 0x4038333231302b2a, 0x504b4a4948434241,
+	0x070605045b5a5958, 0x1e1d1c1716151409, 0x35342f2e2d2c211f, 0x4c47464544393736,
+}
+var expandAVX512_36SIMD_outShufHi = [8]uint64{
+	0x3332313028222120, 0x4a4948403b3a3938, 0x616058535251504b, 0x78706b6a69686362,
+	0x29262524237b7a79, 0x3f3e3d3c37363534, 0x5655544f4e4d4c41, 0x6d6c676665645957,
+}
+
+// expandAVX512_36SIMD expands each bit of the 512-bit mask at src into f
+// consecutive bits of the result (f is presumably 36, this kernel's size
+// class word size — confirm against the generator), returning the low and
+// high 512-bit halves as two Uint64x8 vectors.
+func expandAVX512_36SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	v1 := simd.LoadUint64x8(&expandAVX512_36SIMD_mat0).AsUint8x64()
+	v2 := simd.LoadUint64x8(&expandAVX512_36SIMD_inShuf0).AsUint8x64()
+	v5 := simd.LoadUint64x8(&expandAVX512_36SIMD_mat1).AsUint8x64()
+	v6 := simd.LoadUint64x8(&expandAVX512_36SIMD_inShuf1).AsUint8x64()
+	v9 := simd.LoadUint64x8(&expandAVX512_36SIMD_mat2).AsUint8x64()
+	v10 := simd.LoadUint64x8(&expandAVX512_36SIMD_inShuf2).AsUint8x64()
+	v13 := simd.LoadUint64x8(&expandAVX512_36SIMD_outShufLo).AsUint8x64()
+	v15 := simd.LoadUint64x8(&expandAVX512_36SIMD_outShufHi).AsUint8x64()
+	// Stage 1: gather source bytes (Permute with inShufN), then spread the
+	// bits within each byte via a GF(2) 8x8 affine transform against matN.
+	v3 := v0.Permute(v2)
+	v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v7 := v0.Permute(v6)
+	v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+	v11 := v0.Permute(v10)
+	v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+	// Stage 2: each output half is a single two-source permutation.
+	v14 := v4.Permute2(v8, v13)
+	v16 := v8.Permute2(v12, v15)
+	return v14.AsUint64x8(), v16.AsUint64x8()
+}
+
+// Tables for expandAVX512_40SIMD. Machine-generated data (see the generator
+// under internal/runtime/gc/internal/gen): matN are 8x8 bit matrices for
+// GaloisFieldAffineTransform; inShufN/outShuf* are byte-permutation indices.
+// NOTE(review): 0xff index bytes appear to mark lanes later discarded by a
+// merge mask — confirm against the generator.
+var expandAVX512_40SIMD_mat0 = [8]uint64{
+	0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+	0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_40SIMD_inShuf0 = [8]uint64{
+	0x0101010000000000, 0x0101010000000000, 0x0101010000000000, 0x0101010000000000,
+	0x0101010000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000,
+}
+var expandAVX512_40SIMD_mat1 = [8]uint64{
+	0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+	0x1010101010101010, 0x1010101010101010, 0x2020202020202020, 0x4040404040404040,
+}
+var expandAVX512_40SIMD_inShuf1 = [8]uint64{
+	0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101,
+	0xffffffffffffff01, 0xffff020202020201, 0x0202020101010101, 0x0202020101010101,
+}
+var expandAVX512_40SIMD_mat2 = [8]uint64{
+	0x8080808080808080, 0x0101010101010101, 0x0202020202020202, 0x0404040404040404,
+	0x0808080808080808, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_40SIMD_inShuf2 = [8]uint64{
+	0x0202020101010101, 0x0303030202020202, 0x0303030202020202, 0xffffff0202020202,
+	0xffffff0202020202, 0xffffffffffff0202, 0xffffffffffff0202, 0xffffffffffff0202,
+}
+var expandAVX512_40SIMD_mat3 = [8]uint64{
+	0x0101010101010101, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+	0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_40SIMD_inShuf3 = [8]uint64{
+	0xffffffffffff0303, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+	0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_40SIMD_outShufLo = [8]uint64{
+	0x0a09080403020100, 0x1814131211100c0b, 0x232221201c1b1a19, 0x31302c2b2a292824,
+	0x3c3b3a3938343332, 0x0f0e0d4140070605, 0x1d51501716154948, 0x6027262559581f1e,
+}
+var expandAVX512_40SIMD_outShufHi0 = [8]uint64{
+	0x3938343332313028, 0x44434241403c3b3a, 0x5251504c4b4a4948, 0x605c5b5a59585453,
+	0x2c2b2a2964636261, 0x3e3d69683736352d, 0x797847464571703f, 0x575655ffff4f4e4d,
+}
+var expandAVX512_40SIMD_outShufHi1 = [8]uint64{
+	0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+	0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffff0100ffffff,
+}
+
+// expandAVX512_40SIMD expands each bit of the 512-bit mask at src into f
+// consecutive bits of the result (f is presumably 40, this kernel's size
+// class word size — confirm against the generator), returning the low and
+// high 512-bit halves as two Uint64x8 vectors.
+func expandAVX512_40SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	v1 := simd.LoadUint64x8(&expandAVX512_40SIMD_mat0).AsUint8x64()
+	v2 := simd.LoadUint64x8(&expandAVX512_40SIMD_inShuf0).AsUint8x64()
+	v5 := simd.LoadUint64x8(&expandAVX512_40SIMD_mat1).AsUint8x64()
+	v6 := simd.LoadUint64x8(&expandAVX512_40SIMD_inShuf1).AsUint8x64()
+	v9 := simd.LoadUint64x8(&expandAVX512_40SIMD_mat2).AsUint8x64()
+	v10 := simd.LoadUint64x8(&expandAVX512_40SIMD_inShuf2).AsUint8x64()
+	v13 := simd.LoadUint64x8(&expandAVX512_40SIMD_mat3).AsUint8x64()
+	v14 := simd.LoadUint64x8(&expandAVX512_40SIMD_inShuf3).AsUint8x64()
+	v17 := simd.LoadUint64x8(&expandAVX512_40SIMD_outShufLo).AsUint8x64()
+	v19 := simd.LoadUint64x8(&expandAVX512_40SIMD_outShufHi0).AsUint8x64()
+	v20 := simd.LoadUint64x8(&expandAVX512_40SIMD_outShufHi1).AsUint8x64()
+	// Stage 1: gather source bytes (Permute with inShufN), then spread the
+	// bits within each byte via a GF(2) 8x8 affine transform against matN.
+	v3 := v0.Permute(v2)
+	v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v7 := v0.Permute(v6)
+	v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+	v11 := v0.Permute(v10)
+	v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+	v15 := v0.Permute(v14)
+	v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+	// Stage 2: assemble output byte order; the high half merges two masked
+	// partial permutations over disjoint lanes.
+	v18 := v4.Permute2(v8, v17)
+	u0 := uint64(0xe7ffffffffffffff)
+	m0 := simd.Mask8x64FromBits(u0)
+	v21 := v8.Permute2(v12, v19).Masked(m0)
+	u1 := uint64(0x1800000000000000)
+	m1 := simd.Mask8x64FromBits(u1)
+	v22 := v16.Permute(v20).Masked(m1)
+	v23 := v21.Or(v22)
+	return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+// Tables for expandAVX512_44SIMD. Machine-generated data (see the generator
+// under internal/runtime/gc/internal/gen): matN are 8x8 bit matrices for
+// GaloisFieldAffineTransform; inShufN/outShuf* are byte-permutation indices.
+// NOTE(review): 0xff index bytes appear to mark lanes later discarded by a
+// merge mask — confirm against the generator.
+var expandAVX512_44SIMD_mat0 = [8]uint64{
+	0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+	0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_44SIMD_inShuf0 = [8]uint64{
+	0x0101010000000000, 0xffffffffffff0100, 0x0101010000000000, 0x0101010000000000,
+	0xffffffffffff0100, 0x0101010000000000, 0xffffff0000000000, 0xffffffffffffff00,
+}
+var expandAVX512_44SIMD_mat1 = [8]uint64{
+	0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+	0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+}
+var expandAVX512_44SIMD_inShuf1 = [8]uint64{
+	0xffffff0000000000, 0xffffff0000000000, 0xffffffffffffff00, 0xffffff0000000000,
+	0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101, 0xff02020202020101,
+}
+var expandAVX512_44SIMD_mat2 = [8]uint64{
+	0x1010101010101010, 0x1010101020202020, 0x2020202020202020, 0x4040404040404040,
+	0x4040404080808080, 0x8080808080808080, 0x0101010101010101, 0x0101010102020202,
+}
+var expandAVX512_44SIMD_inShuf2 = [8]uint64{
+	0x0202020101010101, 0xffffffffffff0201, 0x0202020101010101, 0x0202020101010101,
+	0xffffffffffff0201, 0xffff020101010101, 0xffffff0202020202, 0xffffffffffffff02,
+}
+var expandAVX512_44SIMD_mat3 = [8]uint64{
+	0x0202020202020202, 0x0404040404040404, 0x0404040408080808, 0x1010101010101010,
+	0x2020202020202020, 0x4040404040404040, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_44SIMD_inShuf3 = [8]uint64{
+	0xffffff0202020202, 0xffffff0202020202, 0xffffffffffffff02, 0xffffffffffff0202,
+	0xffffffffffff0202, 0xffffffffffff0202, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_44SIMD_outShufLo = [8]uint64{
+	0x1110080403020100, 0x1c1b1a1918141312, 0x31302c2b2a292820, 0x4342414038343332,
+	0x58504c4b4a494844, 0x600706055c5b5a59, 0x1d69681716150961, 0x2f2e2d2171701f1e,
+}
+var expandAVX512_44SIMD_outShufHi0 = [8]uint64{
+	0x4844434241403938, 0x5a59585453525150, 0x6c6b6a6968605c5b, 0xffff787473727170,
+	0xffffffffffffffff, 0x46453e3d3c3b3aff, 0xff57565549ffff47, 0x6d61ffff5f5e5dff,
+}
+var expandAVX512_44SIMD_outShufHi1 = [8]uint64{
+	0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x0100ffffffffffff,
+	0x0c0b0a0908040302, 0xffffffffffffff10, 0x20ffffffff1918ff, 0xffff2928ffffff21,
+}
+
+// expandAVX512_44SIMD expands each bit of the 512-bit mask at src into f
+// consecutive bits of the result (f is presumably 44, this kernel's size
+// class word size — confirm against the generator), returning the low and
+// high 512-bit halves as two Uint64x8 vectors.
+func expandAVX512_44SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	v1 := simd.LoadUint64x8(&expandAVX512_44SIMD_mat0).AsUint8x64()
+	v2 := simd.LoadUint64x8(&expandAVX512_44SIMD_inShuf0).AsUint8x64()
+	v5 := simd.LoadUint64x8(&expandAVX512_44SIMD_mat1).AsUint8x64()
+	v6 := simd.LoadUint64x8(&expandAVX512_44SIMD_inShuf1).AsUint8x64()
+	v9 := simd.LoadUint64x8(&expandAVX512_44SIMD_mat2).AsUint8x64()
+	v10 := simd.LoadUint64x8(&expandAVX512_44SIMD_inShuf2).AsUint8x64()
+	v13 := simd.LoadUint64x8(&expandAVX512_44SIMD_mat3).AsUint8x64()
+	v14 := simd.LoadUint64x8(&expandAVX512_44SIMD_inShuf3).AsUint8x64()
+	v17 := simd.LoadUint64x8(&expandAVX512_44SIMD_outShufLo).AsUint8x64()
+	v19 := simd.LoadUint64x8(&expandAVX512_44SIMD_outShufHi0).AsUint8x64()
+	v20 := simd.LoadUint64x8(&expandAVX512_44SIMD_outShufHi1).AsUint8x64()
+	// Stage 1: gather source bytes (Permute with inShufN), then spread the
+	// bits within each byte via a GF(2) 8x8 affine transform against matN.
+	v3 := v0.Permute(v2)
+	v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v7 := v0.Permute(v6)
+	v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+	v11 := v0.Permute(v10)
+	v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+	v15 := v0.Permute(v14)
+	v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+	// Stage 2: assemble output byte order; the high half merges two masked
+	// partial permutations over disjoint lanes.
+	v18 := v4.Permute2(v8, v17)
+	u0 := uint64(0xce79fe003fffffff)
+	m0 := simd.Mask8x64FromBits(u0)
+	v21 := v8.Permute2(v12, v19).Masked(m0)
+	u1 := uint64(0x318601ffc0000000)
+	m1 := simd.Mask8x64FromBits(u1)
+	v22 := v16.Permute(v20).Masked(m1)
+	v23 := v21.Or(v22)
+	return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+// Tables for expandAVX512_48SIMD. Machine-generated data (see the generator
+// under internal/runtime/gc/internal/gen): matN are 8x8 bit matrices for
+// GaloisFieldAffineTransform; inShufN/outShuf* are byte-permutation indices.
+// This kernel needs only three transform stages and no merge masks.
+var expandAVX512_48SIMD_mat0 = [8]uint64{
+	0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+	0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_48SIMD_inShuf0 = [8]uint64{
+	0x0101000000000000, 0x0101000000000000, 0x0101000000000000, 0xffff000000000000,
+	0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000,
+}
+var expandAVX512_48SIMD_mat1 = [8]uint64{
+	0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0404040404040404,
+	0x0808080808080808, 0x1010101010101010, 0x2020202020202020, 0x4040404040404040,
+}
+var expandAVX512_48SIMD_inShuf1 = [8]uint64{
+	0xffffffff01010101, 0xffffffff01010101, 0xffffffffffff0101, 0x0202020202020101,
+	0x0202010101010101, 0x0202010101010101, 0x0202010101010101, 0xffff010101010101,
+}
+var expandAVX512_48SIMD_mat2 = [8]uint64{
+	0x8080808080808080, 0x0101010101010101, 0x0202020202020202, 0x0808080808080808,
+	0x1010101010101010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_48SIMD_inShuf2 = [8]uint64{
+	0xffff010101010101, 0xffff020202020202, 0xffff020202020202, 0xffffffff02020202,
+	0xffffffff02020202, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_48SIMD_outShufLo = [8]uint64{
+	0x0908050403020100, 0x131211100d0c0b0a, 0x1d1c1b1a19181514, 0x2928252423222120,
+	0x333231302d2c2b2a, 0x3d3c3b3a39383534, 0x0f0e434241400706, 0x515017164b4a4948,
+}
+var expandAVX512_48SIMD_outShufHi = [8]uint64{
+	0x2524232221201918, 0x31302d2c2b2a2928, 0x3b3a393835343332, 0x4544434241403d3c,
+	0x51504d4c4b4a4948, 0x1d1c1b1a55545352, 0x5b5a595827261f1e, 0x3736636261602f2e,
+}
+
+// expandAVX512_48SIMD expands each bit of the 512-bit mask at src into f
+// consecutive bits of the result (f is presumably 48, this kernel's size
+// class word size — confirm against the generator), returning the low and
+// high 512-bit halves as two Uint64x8 vectors.
+func expandAVX512_48SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	v1 := simd.LoadUint64x8(&expandAVX512_48SIMD_mat0).AsUint8x64()
+	v2 := simd.LoadUint64x8(&expandAVX512_48SIMD_inShuf0).AsUint8x64()
+	v5 := simd.LoadUint64x8(&expandAVX512_48SIMD_mat1).AsUint8x64()
+	v6 := simd.LoadUint64x8(&expandAVX512_48SIMD_inShuf1).AsUint8x64()
+	v9 := simd.LoadUint64x8(&expandAVX512_48SIMD_mat2).AsUint8x64()
+	v10 := simd.LoadUint64x8(&expandAVX512_48SIMD_inShuf2).AsUint8x64()
+	v13 := simd.LoadUint64x8(&expandAVX512_48SIMD_outShufLo).AsUint8x64()
+	v15 := simd.LoadUint64x8(&expandAVX512_48SIMD_outShufHi).AsUint8x64()
+	// Stage 1: gather source bytes (Permute with inShufN), then spread the
+	// bits within each byte via a GF(2) 8x8 affine transform against matN.
+	v3 := v0.Permute(v2)
+	v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+	v7 := v0.Permute(v6)
+	v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+	v11 := v0.Permute(v10)
+	v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+	// Stage 2: each output half is a single two-source permutation.
+	v14 := v4.Permute2(v8, v13)
+	v16 := v8.Permute2(v12, v15)
+	return v14.AsUint64x8(), v16.AsUint64x8()
+}
+
+var expandAVX512_52SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_52SIMD_inShuf0 = [8]uint64{
+ 0x0101000000000000, 0xffffffffffff0100, 0x0101000000000000, 0xffff000000000000,
+ 0xffffffffffffff00, 0xffff000000000000, 0xffff000000000000, 0xffffffffffffff00,
+}
+var expandAVX512_52SIMD_mat1 = [8]uint64{
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0101010101010101, 0x0202020202020202, 0x0202020202020202, 0x0404040404040404,
+}
+var expandAVX512_52SIMD_inShuf1 = [8]uint64{
+ 0xffff000000000000, 0xffff000000000000, 0xffffffffffffff00, 0xffff000000000000,
+ 0xffffffff01010101, 0xffffffffff010101, 0xff02020202020201, 0x0202010101010101,
+}
+var expandAVX512_52SIMD_mat2 = [8]uint64{
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+}
+var expandAVX512_52SIMD_inShuf2 = [8]uint64{
+ 0xffffffffffff0201, 0x0202010101010101, 0xffff010101010101, 0xffffffffffffff01,
+ 0xffff010101010101, 0xffff010101010101, 0xffffffffffffff01, 0xffff010101010101,
+}
+var expandAVX512_52SIMD_mat3 = [8]uint64{
+ 0x0101010101010101, 0x0101010102020202, 0x0404040404040404, 0x0808080808080808,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_52SIMD_inShuf3 = [8]uint64{
+ 0xffff020202020202, 0xffffffffffffff02, 0xffffffff02020202, 0xffffffffffff0202,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_52SIMD_outShufLo = [8]uint64{
+ 0x1008050403020100, 0x1a19181514131211, 0x2b2a2928201d1c1b, 0x3534333231302d2c,
+ 0x4845444342414038, 0x5958504d4c4b4a49, 0x616007065d5c5b5a, 0x6a69681716096362,
+}
+var expandAVX512_52SIMD_outShufHi0 = [8]uint64{
+ 0x403d3c3b3a393830, 0x51504d4c4b4a4948, 0x6261605855545352, 0x6c6b6a6968656463,
+ 0x7d7c7b7a7978706d, 0x31ffffffffffffff, 0xff3f3e3635343332, 0xffff4f4e41ffffff,
+}
+var expandAVX512_52SIMD_outShufHi1 = [8]uint64{
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xff08050403020100, 0x10ffffffffffffff, 0x1918ffffff131211,
+}
+
+func expandAVX512_52SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_52SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_52SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_52SIMD_mat1).AsUint8x64()
+ v6 := simd.LoadUint64x8(&expandAVX512_52SIMD_inShuf1).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_52SIMD_mat2).AsUint8x64()
+ v10 := simd.LoadUint64x8(&expandAVX512_52SIMD_inShuf2).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_52SIMD_mat3).AsUint8x64()
+ v14 := simd.LoadUint64x8(&expandAVX512_52SIMD_inShuf3).AsUint8x64()
+ v17 := simd.LoadUint64x8(&expandAVX512_52SIMD_outShufLo).AsUint8x64()
+ v19 := simd.LoadUint64x8(&expandAVX512_52SIMD_outShufHi0).AsUint8x64()
+ v20 := simd.LoadUint64x8(&expandAVX512_52SIMD_outShufHi1).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v7 := v0.Permute(v6)
+ v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+ v11 := v0.Permute(v10)
+ v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+ v15 := v0.Permute(v14)
+ v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+ v18 := v4.Permute2(v8, v17)
+ u0 := uint64(0x387f80ffffffffff)
+ m0 := simd.Mask8x64FromBits(u0)
+ v21 := v8.Permute2(v12, v19).Masked(m0)
+ u1 := uint64(0xc7807f0000000000)
+ m1 := simd.Mask8x64FromBits(u1)
+ v22 := v16.Permute(v20).Masked(m1)
+ v23 := v21.Or(v22)
+ return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_56SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_56SIMD_inShuf0 = [8]uint64{
+ 0x0100000000000000, 0x0100000000000000, 0xff00000000000000, 0xff00000000000000,
+ 0xff00000000000000, 0xff00000000000000, 0xff00000000000000, 0xff00000000000000,
+}
+var expandAVX512_56SIMD_inShuf1 = [8]uint64{
+ 0xffff010101010101, 0x0202010101010101, 0x0201010101010101, 0xff01010101010101,
+ 0xff01010101010101, 0xff01010101010101, 0xff01010101010101, 0xff01010101010101,
+}
+var expandAVX512_56SIMD_mat2 = [8]uint64{
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_56SIMD_inShuf2 = [8]uint64{
+ 0xff02020202020202, 0xffffff0202020202, 0xffffffffffffff02, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_56SIMD_outShufLo = [8]uint64{
+ 0x0806050403020100, 0x11100e0d0c0b0a09, 0x1a19181615141312, 0x232221201e1d1c1b,
+ 0x2c2b2a2928262524, 0x3534333231302e2d, 0x3e3d3c3b3a393836, 0x0f45444342414007,
+}
+var expandAVX512_56SIMD_outShufHi = [8]uint64{
+ 0x11100d0c0b0a0908, 0x1a19181615141312, 0x232221201e1d1c1b, 0x2c2b2a2928262524,
+ 0x3534333231302e2d, 0x3e3d3c3b3a393836, 0x0e46454443424140, 0x50174c4b4a49480f,
+}
+
+func expandAVX512_56SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_56SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_56SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_56SIMD_inShuf1).AsUint8x64()
+ v8 := simd.LoadUint64x8(&expandAVX512_56SIMD_mat2).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_56SIMD_inShuf2).AsUint8x64()
+ v12 := simd.LoadUint64x8(&expandAVX512_56SIMD_outShufLo).AsUint8x64()
+ v14 := simd.LoadUint64x8(&expandAVX512_56SIMD_outShufHi).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v6 := v0.Permute(v5)
+ v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v10 := v0.Permute(v9)
+ v11 := v10.GaloisFieldAffineTransform(v8.AsUint64x8(), 0)
+ v13 := v4.Permute2(v7, v12)
+ v15 := v7.Permute2(v11, v14)
+ return v13.AsUint64x8(), v15.AsUint64x8()
+}
+
+var expandAVX512_60SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_60SIMD_inShuf0 = [8]uint64{
+ 0x0100000000000000, 0xffffffffffffff00, 0xff00000000000000, 0xff00000000000000,
+ 0xffffffffffffff00, 0xff00000000000000, 0xff00000000000000, 0xffffffffffffff00,
+}
+var expandAVX512_60SIMD_mat1 = [8]uint64{
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0101010101010101, 0x0101010101010101, 0x0101010102020202, 0x0202020202020202,
+}
+var expandAVX512_60SIMD_inShuf1 = [8]uint64{
+ 0xff00000000000000, 0xff00000000000000, 0xffffffffffffff00, 0xff00000000000000,
+ 0xffffffffff010101, 0x0202020202010101, 0xffffffffffff0201, 0xff01010101010101,
+}
+var expandAVX512_60SIMD_mat2 = [8]uint64{
+ 0x0404040404040404, 0x0404040408080808, 0x0808080808080808, 0x1010101010101010,
+ 0x1010101020202020, 0x2020202020202020, 0x4040404040404040, 0x4040404080808080,
+}
+var expandAVX512_60SIMD_inShuf2 = [8]uint64{
+ 0xff01010101010101, 0xffffffffffffff01, 0xff01010101010101, 0xff01010101010101,
+ 0xffffffffffffff01, 0xff01010101010101, 0xff01010101010101, 0xffffffffffffff01,
+}
+var expandAVX512_60SIMD_mat3 = [8]uint64{
+ 0x8080808080808080, 0x0101010101010101, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_60SIMD_inShuf3 = [8]uint64{
+ 0xff01010101010101, 0xffffffffffff0202, 0xffffffffffffffff, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_60SIMD_outShufLo = [8]uint64{
+ 0x0806050403020100, 0x1816151413121110, 0x28201e1d1c1b1a19, 0x31302e2d2c2b2a29,
+ 0x4140383635343332, 0x4a49484645444342, 0x5a5958504e4d4c4b, 0x626160075e5d5c5b,
+}
+var expandAVX512_60SIMD_outShufHi0 = [8]uint64{
+ 0x3b3a3938302a2928, 0x44434241403e3d3c, 0x5453525150484645, 0x5d5c5b5a59585655,
+ 0x6d6c6b6a6968605e, 0x767574737271706e, 0xffffffffffffff78, 0x31ffff2f2e2d2c2b,
+}
+var expandAVX512_60SIMD_outShufHi1 = [8]uint64{
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0x06050403020100ff, 0xff0908ffffffffff,
+}
+
+func expandAVX512_60SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_60SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_60SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_60SIMD_mat1).AsUint8x64()
+ v6 := simd.LoadUint64x8(&expandAVX512_60SIMD_inShuf1).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_60SIMD_mat2).AsUint8x64()
+ v10 := simd.LoadUint64x8(&expandAVX512_60SIMD_inShuf2).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_60SIMD_mat3).AsUint8x64()
+ v14 := simd.LoadUint64x8(&expandAVX512_60SIMD_inShuf3).AsUint8x64()
+ v17 := simd.LoadUint64x8(&expandAVX512_60SIMD_outShufLo).AsUint8x64()
+ v19 := simd.LoadUint64x8(&expandAVX512_60SIMD_outShufHi0).AsUint8x64()
+ v20 := simd.LoadUint64x8(&expandAVX512_60SIMD_outShufHi1).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v7 := v0.Permute(v6)
+ v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+ v11 := v0.Permute(v10)
+ v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+ v15 := v0.Permute(v14)
+ v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+ v18 := v4.Permute2(v8, v17)
+ u0 := uint64(0x9f01ffffffffffff)
+ m0 := simd.Mask8x64FromBits(u0)
+ v21 := v8.Permute2(v12, v19).Masked(m0)
+ u1 := uint64(0x60fe000000000000)
+ m1 := simd.Mask8x64FromBits(u1)
+ v22 := v16.Permute(v20).Masked(m1)
+ v23 := v21.Or(v22)
+ return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_64SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_64SIMD_inShuf0 = [8]uint64{
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_64SIMD_inShuf1 = [8]uint64{
+ 0x0101010101010101, 0x0101010101010101, 0x0101010101010101, 0x0101010101010101,
+ 0x0101010101010101, 0x0101010101010101, 0x0101010101010101, 0x0101010101010101,
+}
+var expandAVX512_64SIMD_outShufLo = [8]uint64{
+ 0x0706050403020100, 0x0f0e0d0c0b0a0908, 0x1716151413121110, 0x1f1e1d1c1b1a1918,
+ 0x2726252423222120, 0x2f2e2d2c2b2a2928, 0x3736353433323130, 0x3f3e3d3c3b3a3938,
+}
+
+func expandAVX512_64SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_64SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_64SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_64SIMD_inShuf1).AsUint8x64()
+ v8 := simd.LoadUint64x8(&expandAVX512_64SIMD_outShufLo).AsUint8x64()
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v6 := v0.Permute(v5)
+ v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v9 := v4.Permute(v8)
+ v10 := v7.Permute(v8)
+ return v9.AsUint64x8(), v10.AsUint64x8()
+}
diff --git a/src/internal/runtime/gc/scan/mkexpanders.go b/src/internal/runtime/gc/scan/mkexpanders.go
new file mode 100644
index 0000000..bbfcb37
--- /dev/null
+++ b/src/internal/runtime/gc/scan/mkexpanders.go
@@ -0,0 +1,625 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file is a fork of mkasm.go; instead of generating
+// assemblies, this file generates Go code using the simd
+// package via GOEXPERIMENT.
+
+//go:build ignore
+
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "go/format"
+ "log"
+ "os"
+ "slices"
+ "strconv"
+ "text/template"
+ "unsafe"
+
+ "internal/runtime/gc"
+)
+
+var simdTemplate = template.Must(template.New("template").Parse(`
+{{- define "header"}}
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+import (
+ "simd"
+ "unsafe"
+)
+{{- end}}
+{{- define "expandersList"}}
+var gcExpandersAVX512SIMD = [{{.NumFuncs}}]func(unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8){
+{{- range .Funcs}}
+ {{.}},
+{{- end}}
+}
+{{- end}}
+
+{{- define "expanderData"}}
+var {{.Name}} = [8]uint64{
+{{.Vals}}
+}
+{{- end}}
+
+{{- define "expander"}}
+func {{.Name}}(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ {{- .BodyLoad}}
+ {{- .Body}}
+}
+{{- end}}
+`))
+
+type expandersListData struct {
+ NumFuncs int
+ Funcs []string
+}
+
+type expanderDataData struct {
+ Name string
+ Vals string
+}
+
+type expanderData struct {
+ Name string
+ BodyLoad string
+ Body string
+ data []expanderDataData
+ dataV2N map[string]string
+ uint8x64Cnt int
+ mask8x64Cnt int
+ uint64Cnt int
+}
+
+func main() {
+ generate("expanders_amd64.go", genExpanders)
+}
+
+func generate(fileName string, genFunc func(*bytes.Buffer)) {
+ var buf bytes.Buffer
+ genFunc(&buf)
+ f, err := os.Create(fileName)
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer f.Close()
+ b, err := format.Source(buf.Bytes())
+ if err != nil {
+ log.Print(buf.String())
+ log.Fatal(err)
+ }
+ _, err = f.Write(b)
+ if err != nil {
+ log.Fatal(err)
+ }
+}
+
+func genExpanders(buffer *bytes.Buffer) {
+ if err := simdTemplate.ExecuteTemplate(buffer, "header", nil); err != nil {
+ panic(fmt.Errorf("failed to execute header template: %w", err))
+ }
+ gcExpandersAVX512 := make([]expanderData, len(gc.SizeClassToSize))
+ for sc, ob := range gc.SizeClassToSize {
+ if gc.SizeClassToNPages[sc] != 1 {
+ // These functions all produce a bitmap that covers exactly one
+ // page.
+ continue
+ }
+ if ob > gc.MinSizeForMallocHeader {
+ // This size class is too big to have a packed pointer/scalar bitmap.
+ break
+ }
+
+ xf := int(ob) / 8
+ log.Printf("size class %d bytes, expansion %dx", ob, xf)
+
+ fn := expanderData{Name: fmt.Sprintf("expandAVX512_%dSIMD", xf), dataV2N: make(map[string]string)}
+
+ if xf == 1 {
+ fn.expandIdentity()
+ } else {
+ ok := gfExpander(xf, &fn)
+ if !ok {
+ log.Printf("failed to generate expander for size class %d", sc)
+ }
+ }
+ gcExpandersAVX512[sc] = fn
+ }
+ // Fill in the expanders data first
+ eld := expandersListData{len(gcExpandersAVX512), make([]string, len(gcExpandersAVX512))}
+ for i, gce := range gcExpandersAVX512 {
+ if gce.Name == "" {
+ eld.Funcs[i] = "nil"
+ } else {
+ eld.Funcs[i] = gce.Name
+ }
+ }
+ if err := simdTemplate.ExecuteTemplate(buffer, "expandersList", eld); err != nil {
+ panic(fmt.Errorf("failed to execute expandersList template: %w", err))
+ }
+ // List out the expander functions and their data
+ for _, gce := range gcExpandersAVX512 {
+ if gce.Name == "" {
+ continue
+ }
+ for _, data := range gce.data {
+ if err := simdTemplate.ExecuteTemplate(buffer, "expanderData", data); err != nil {
+ panic(fmt.Errorf("failed to execute expanderData template: %w", err))
+ }
+ }
+ if err := simdTemplate.ExecuteTemplate(buffer, "expander", gce); err != nil {
+ panic(fmt.Errorf("failed to execute expander template: %w", err))
+ }
+ }
+}
+
+// mat8x8 is an 8x8 bit matrix.
+type mat8x8 struct {
+ mat [8]uint8
+}
+
+func matGroupToVec(mats *[8]mat8x8) [8]uint64 {
+ var out [8]uint64
+ for i, mat := range mats {
+ for j, row := range mat.mat {
+ // For some reason, Intel flips the rows.
+ out[i] |= uint64(row) << ((7 - j) * 8)
+ }
+ }
+ return out
+}
+
+func (fn *expanderData) newVec() string {
+ v := fmt.Sprintf("v%d", fn.uint8x64Cnt)
+ fn.uint8x64Cnt++
+ return v
+}
+
+func (fn *expanderData) newMask() string {
+ v := fmt.Sprintf("m%d", fn.mask8x64Cnt)
+ fn.mask8x64Cnt++
+ return v
+}
+
+func (fn *expanderData) newU() string {
+ v := fmt.Sprintf("u%d", fn.uint64Cnt)
+ fn.uint64Cnt++
+ return v
+}
+
+// expandIdentity implements 1x expansion (that is, no expansion).
+func (fn *expanderData) expandIdentity() {
+ fn.Body = `
+ x := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ y := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(src)+64))).AsUint8x64()
+ return x.AsUint64x8(), y.AsUint64x8()`
+}
+
+func (fn *expanderData) loadSrcAsUint8x64() string {
+ v := fn.newVec()
+ fn.BodyLoad += fmt.Sprintf("%s := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()\n", v)
+ return v
+}
+
+func (fn *expanderData) loadGlobalArrAsUint8x64(arrName string) string {
+ v := fn.newVec()
+ fn.BodyLoad += fmt.Sprintf("%s := simd.LoadUint64x8(&%s).AsUint8x64()\n", v, arrName)
+ return v
+}
+
+func (fn *expanderData) permuteUint8x64(data, indices string) string {
+ v := fn.newVec()
+ fn.Body += fmt.Sprintf("%s := %s.Permute(%s)\n", v, data, indices)
+ return v
+}
+
+func (fn *expanderData) permute2Uint8x64(x, y, indices string) string {
+ v := fn.newVec()
+ fn.Body += fmt.Sprintf("%s := %s.Permute2(%s, %s)\n", v, x, y, indices)
+ return v
+}
+
+func (fn *expanderData) permuteMaskedUint8x64(data, indices, mask string) string {
+ v := fn.newVec()
+ fn.Body += fmt.Sprintf("%s := %s.Permute(%s).Masked(%s)\n", v, data, indices, mask)
+ return v
+}
+
+func (fn *expanderData) permute2MaskedUint8x64(x, y, indices, mask string) string {
+ v := fn.newVec()
+ fn.Body += fmt.Sprintf("%s := %s.Permute2(%s, %s).Masked(%s)\n", v, x, y, indices, mask)
+ return v
+}
+
+func (fn *expanderData) galoisFieldAffineTransformUint8x64(data, matrix string) string {
+ v := fn.newVec()
+ fn.Body += fmt.Sprintf("%s := %s.GaloisFieldAffineTransform(%s.AsUint64x8(), 0)\n", v, data, matrix)
+ return v
+}
+
+func (fn *expanderData) returns(x, y string) {
+ fn.Body += fmt.Sprintf("return %s.AsUint64x8(), %s.AsUint64x8()", x, y)
+}
+
+func uint8x64Data(data [64]uint8) string {
+ res := ""
+ for i := range 8 {
+ ptr64 := (*uint64)(unsafe.Pointer(&data[i*8]))
+ res += fmt.Sprintf("%#016x,", *ptr64)
+ if i == 3 {
+ res += "\n"
+ }
+ }
+ return res
+}
+
+func uint64x8Data(data [8]uint64) string {
+ res := ""
+ for i := range 8 {
+ res += fmt.Sprintf("%#016x,", data[i])
+ if i == 3 {
+ res += "\n"
+ }
+ }
+ return res
+}
+
+func (fn *expanderData) loadGlobalUint8x64(name string, data [64]uint8) string {
+ val := uint8x64Data(data)
+ if n, ok := fn.dataV2N[val]; !ok {
+ fullName := fmt.Sprintf("%s_%s", fn.Name, name)
+ fn.data = append(fn.data, expanderDataData{fullName, val})
+ v := fn.loadGlobalArrAsUint8x64(fullName)
+ fn.dataV2N[val] = v
+ return v
+ } else {
+ return n
+ }
+}
+
+func (fn *expanderData) loadGlobalUint64x8(name string, data [8]uint64) string {
+ val := uint64x8Data(data)
+ if n, ok := fn.dataV2N[val]; !ok {
+ fullName := fmt.Sprintf("%s_%s", fn.Name, name)
+ fn.data = append(fn.data, expanderDataData{fullName, val})
+ v := fn.loadGlobalArrAsUint8x64(fullName)
+ fn.dataV2N[val] = v
+ return v
+ } else {
+ return n
+ }
+}
+
+func (fn *expanderData) mask8x64FromBits(data uint64) string {
+ v1 := fn.newU()
+ v2 := fn.newMask()
+ fn.Body += fmt.Sprintf("%s := uint64(%#x)\n%s := simd.Mask8x64FromBits(%s)\n",
+ v1, data, v2, v1)
+ return v2
+}
+
+func (fn *expanderData) orUint8x64(x, y string) string {
+ v := fn.newVec()
+ fn.Body += fmt.Sprintf("%s := %s.Or(%s)\n", v, x, y)
+ return v
+}
+
+// gfExpander produces a function that expands each bit in an input bitmap into
+// f consecutive bits in an output bitmap.
+//
+// The input is
+//
+// AX *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
+//
+// The output is
+//
+// Z1 [64]uint8 = The bottom 512 bits of the expanded bitmap
+// Z2 [64]uint8 = The top 512 bits of the expanded bitmap
+//
+// TODO(austin): This should use Z0/Z1.
+func gfExpander(f int, fn *expanderData) bool {
+ // TODO(austin): For powers of 2 >= 8, we can use mask expansion ops to make this much simpler.
+
+ // TODO(austin): For f >= 8, I suspect there are better ways to do this.
+ //
+ // For example, we could use a mask expansion to get a full byte for each
+ // input bit, and separately create the bytes that blend adjacent bits, then
+ // shuffle those bytes together. Certainly for f >= 16 this makes sense
+ // because each of those bytes will be used, possibly more than once.
+
+ objBits := fn.loadSrcAsUint8x64()
+
+ type term struct {
+ iByte, oByte int
+ mat mat8x8
+ }
+ var terms []term
+
+ // Iterate over all output bytes and construct the 8x8 GF2 matrix to compute
+ // the output byte from the appropriate input byte. Gather all of these into
+ // "terms".
+ for oByte := 0; oByte < 1024/8; oByte++ {
+ var byteMat mat8x8
+ iByte := -1
+ for oBit := oByte * 8; oBit < oByte*8+8; oBit++ {
+ iBit := oBit / f
+ if iByte == -1 {
+ iByte = iBit / 8
+ } else if iByte != iBit/8 {
+ log.Printf("output byte %d straddles input bytes %d and %d", oByte, iByte, iBit/8)
+ return false
+ }
+ // One way to view this is that the i'th row of the matrix will be
+ // ANDed with the input byte, and the parity of the result will set
+ // the i'th bit in the output. We use a simple 1 bit mask, so the
+ // parity is irrelevant beyond selecting out that one bit.
+ byteMat.mat[oBit%8] = 1 << (iBit % 8)
+ }
+ terms = append(terms, term{iByte, oByte, byteMat})
+ }
+
+ if false {
+ // Print input byte -> output byte as a matrix
+ maxIByte, maxOByte := 0, 0
+ for _, term := range terms {
+ maxIByte = max(maxIByte, term.iByte)
+ maxOByte = max(maxOByte, term.oByte)
+ }
+ iToO := make([][]rune, maxIByte+1)
+ for i := range iToO {
+ iToO[i] = make([]rune, maxOByte+1)
+ }
+ matMap := make(map[mat8x8]int)
+ for _, term := range terms {
+ i, ok := matMap[term.mat]
+ if !ok {
+ i = len(matMap)
+ matMap[term.mat] = i
+ }
+ iToO[term.iByte][term.oByte] = 'A' + rune(i)
+ }
+ for o := range maxOByte + 1 {
+ fmt.Printf("%d", o)
+ for i := range maxIByte + 1 {
+ fmt.Printf(",")
+ if mat := iToO[i][o]; mat != 0 {
+ fmt.Printf("%c", mat)
+ }
+ }
+ fmt.Println()
+ }
+ }
+
+ // In hardware, each (8 byte) matrix applies to 8 bytes of data in parallel,
+ // and we get to operate on up to 8 matrixes in parallel (or 64 values). That is:
+ //
+ // abcdefgh ijklmnop qrstuvwx yzABCDEF GHIJKLMN OPQRSTUV WXYZ0123 456789_+
+ // mat0 mat1 mat2 mat3 mat4 mat5 mat6 mat7
+
+ // Group the terms by matrix, but limit each group to 8 terms.
+ const termsPerGroup = 8 // Number of terms we can multiply by the same matrix.
+ const groupsPerSuperGroup = 8 // Number of matrixes we can fit in a vector.
+
+ matMap := make(map[mat8x8]int)
+ allMats := make(map[mat8x8]bool)
+ var termGroups [][]term
+ for _, term := range terms {
+ allMats[term.mat] = true
+
+ i, ok := matMap[term.mat]
+ if ok && f > groupsPerSuperGroup {
+ // The output is ultimately produced in two [64]uint8 registers.
+ // Getting every byte in the right place of each of these requires a
+ // final permutation that often requires more than one source.
+ //
+ // Up to 8x expansion, we can get a really nice grouping so we can use
+ // the same 8 matrix vector several times, without producing
+ // permutations that require more than two sources.
+ //
+ // Above 8x, however, we can't get nice matrixes anyway, so we
+ // instead prefer reducing the complexity of the permutations we
+ // need to produce the final outputs. To do this, avoid grouping
+ // together terms that are split across the two registers.
+ outRegister := termGroups[i][0].oByte / 64
+ if term.oByte/64 != outRegister {
+ ok = false
+ }
+ }
+ if !ok {
+ // Start a new term group.
+ i = len(termGroups)
+ matMap[term.mat] = i
+ termGroups = append(termGroups, nil)
+ }
+
+ termGroups[i] = append(termGroups[i], term)
+
+ if len(termGroups[i]) == termsPerGroup {
+ // This term group is full.
+ delete(matMap, term.mat)
+ }
+ }
+
+ for i, termGroup := range termGroups {
+ log.Printf("term group %d:", i)
+ for _, term := range termGroup {
+ log.Printf(" %+v", term)
+ }
+ }
+
+ // We can do 8 matrix multiplies in parallel, which is 8 term groups. Pack
+ // as many term groups as we can into each super-group to minimize the
+ // number of matrix multiplies.
+ //
+ // Ideally, we use the same matrix in each super-group, which might mean
+ // doing fewer than 8 multiplies at a time. That's fine because it never
+ // increases the total number of matrix multiplies.
+ //
+ // TODO: Packing the matrixes less densely may let us use more broadcast
+ // loads instead of general permutations, though. That replaces a load of
+ // the permutation with a load of the matrix, but is probably still slightly
+ // better.
+ var sgSize, nSuperGroups int
+ oneMatVec := f <= groupsPerSuperGroup
+ if oneMatVec {
+ // We can use the same matrix in each multiply by doing sgSize
+ // multiplies at a time.
+ sgSize = groupsPerSuperGroup / len(allMats) * len(allMats)
+ nSuperGroups = (len(termGroups) + sgSize - 1) / sgSize
+ } else {
+ // We can't use the same matrix for each multiply. Just do as many at a
+ // time as we can.
+ //
+ // TODO: This is going to produce several distinct matrixes, when we
+ // probably only need two. Be smarter about how we create super-groups
+ // in this case. Maybe we build up an array of super-groups and then the
+ // loop below just turns them into ops?
+ sgSize = 8
+ nSuperGroups = (len(termGroups) + groupsPerSuperGroup - 1) / groupsPerSuperGroup
+ }
+
+ // Construct each super-group.
+ var matGroup [8]mat8x8
+ var matMuls []string
+ var perm [128]int
+ for sgi := range nSuperGroups {
+ var iperm [64]uint8
+ for i := range iperm {
+ iperm[i] = 0xff // "Don't care"
+ }
+ // Pick off sgSize term groups.
+ superGroup := termGroups[:min(len(termGroups), sgSize)]
+ termGroups = termGroups[len(superGroup):]
+ // Build the matrix and permutations for this super-group.
+ var thisMatGroup [8]mat8x8
+ for i, termGroup := range superGroup {
+ // All terms in this group have the same matrix. Pick one.
+ thisMatGroup[i] = termGroup[0].mat
+ for j, term := range termGroup {
+ // Build the input permutation.
+ iperm[i*termsPerGroup+j] = uint8(term.iByte)
+ // Build the output permutation.
+ perm[term.oByte] = sgi*groupsPerSuperGroup*termsPerGroup + i*termsPerGroup + j
+ }
+ }
+ log.Printf("input permutation %d: %v", sgi, iperm)
+
+ // Check that we're not making more distinct matrixes than expected.
+ if oneMatVec {
+ if sgi == 0 {
+ matGroup = thisMatGroup
+ } else if matGroup != thisMatGroup {
+ log.Printf("super-groups have different matrixes:\n%+v\n%+v", matGroup, thisMatGroup)
+ return false
+ }
+ }
+
+ // Emit matrix op.
+ matConst :=
+ fn.loadGlobalUint64x8(fmt.Sprintf("mat%d", sgi),
+ matGroupToVec(&thisMatGroup))
+ inShufConst :=
+ fn.loadGlobalUint8x64(fmt.Sprintf("inShuf%d", sgi),
+ iperm)
+ inOp := fn.permuteUint8x64(objBits, inShufConst)
+ matMul := fn.galoisFieldAffineTransformUint8x64(inOp, matConst)
+ matMuls = append(matMuls, matMul)
+ }
+
+ log.Printf("output permutation: %v", perm)
+
+ outLo, ok := genShuffle(fn, "outShufLo", (*[64]int)(perm[:64]), matMuls...)
+ if !ok {
+ log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
+ return false
+ }
+ outHi, ok := genShuffle(fn, "outShufHi", (*[64]int)(perm[64:]), matMuls...)
+ if !ok {
+ log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
+ return false
+ }
+ fn.returns(outLo, outHi)
+
+ return true
+}
+
+func genShuffle(fn *expanderData, name string, perm *[64]int, args ...string) (string, bool) {
+ // Construct flattened permutation.
+ var vperm [64]byte
+
+ // Get the inputs used by this permutation.
+ var inputs []int
+ for i, src := range perm {
+ inputIdx := slices.Index(inputs, src/64)
+ if inputIdx == -1 {
+ inputIdx = len(inputs)
+ inputs = append(inputs, src/64)
+ }
+ vperm[i] = byte(src%64 | (inputIdx << 6))
+ }
+
+ // Emit instructions for easy cases.
+ switch len(inputs) {
+ case 1:
+ constOp := fn.loadGlobalUint8x64(name, vperm)
+ return fn.permuteUint8x64(args[inputs[0]], constOp), true
+ case 2:
+ constOp := fn.loadGlobalUint8x64(name, vperm)
+ return fn.permute2Uint8x64(args[inputs[0]], args[inputs[1]], constOp), true
+ }
+
+ // Harder case, we need to shuffle in from up to 2 more tables.
+ //
+ // Perform two shuffles. One shuffle will get its data from the first
+ // two inputs, the other shuffle will get its data from the other one
+// or two inputs. All values each shuffle doesn't care about will
+ // be zeroed.
+ var vperms [2][64]byte
+ var masks [2]uint64
+ for j, idx := range vperm {
+ for i := range vperms {
+ vperms[i][j] = 0xff // "Don't care"
+ }
+ if idx == 0xff {
+ continue
+ }
+ vperms[idx/128][j] = idx % 128
+ masks[idx/128] |= uint64(1) << j
+ }
+
+ // Validate that the masks are fully disjoint.
+ if masks[0]^masks[1] != ^uint64(0) {
+ panic("bad shuffle!")
+ }
+
+ // Generate constants.
+ constOps := make([]string, len(vperms))
+ for i, v := range vperms {
+ constOps[i] = fn.loadGlobalUint8x64(name+strconv.Itoa(i), v)
+ }
+
+ // Generate shuffles.
+ switch len(inputs) {
+ case 3:
+ r0 := fn.permute2MaskedUint8x64(args[inputs[0]], args[inputs[1]], constOps[0], fn.mask8x64FromBits(masks[0]))
+ r1 := fn.permuteMaskedUint8x64(args[inputs[2]], constOps[1], fn.mask8x64FromBits(masks[1]))
+ return fn.orUint8x64(r0, r1), true
+ case 4:
+ r0 := fn.permute2MaskedUint8x64(args[inputs[0]], args[inputs[1]], constOps[0], fn.mask8x64FromBits(masks[0]))
+ r1 := fn.permute2MaskedUint8x64(args[inputs[2]], args[inputs[3]], constOps[1], fn.mask8x64FromBits(masks[1]))
+ return fn.orUint8x64(r0, r1), true
+ }
+
+ // Too many inputs. To support more, we'd need to separate tables much earlier.
+ // Right now all the indices fit in a byte, but with >4 inputs they might not (>256 bytes).
+ return args[0], false
+}
diff --git a/src/internal/runtime/gc/scan/scan_amd64.go b/src/internal/runtime/gc/scan/scan_amd64.go
index 2ac181f..85ef4ea 100644
--- a/src/internal/runtime/gc/scan/scan_amd64.go
+++ b/src/internal/runtime/gc/scan/scan_amd64.go
@@ -5,13 +5,23 @@
package scan
import (
+ "internal/abi"
"internal/cpu"
"internal/runtime/gc"
+ "math/bits"
+ "simd"
"unsafe"
)
func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
if CanAVX512() {
+ return ScanSpanPackedAVX512SIMD(mem, bufp, objMarks, sizeClass, ptrMask)
+ }
+ panic("not implemented")
+}
+
+func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+ if CanAVX512() {
return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
}
panic("not implemented")
@@ -34,6 +44,64 @@
//go:noescape
func scanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
+// FilterNilSIMD compacts the n pointer-sized words at bufp in place,
+// dropping zero (nil) entries while preserving the relative order of
+// the survivors, and returns the number of non-nil words kept.
+//
+// Full groups of 8 words are handled with a masked vector compress;
+// the remainder falls through to a scalar loop.
+func FilterNilSIMD(bufp *uintptr, n int32) (cnt int32) {
+ scanned := 0
+ buf := unsafe.Slice((*int64)(unsafe.Pointer(bufp)), int(n))
+ // Use the widest vector
+ var plainZeros simd.Int64x8
+ for ; scanned+8 <= int(n); scanned += 8 {
+ v := simd.LoadInt64x8Slice(buf[scanned:])
+ m := v.NotEqual(plainZeros)
+ // In-place compress is safe: cnt never exceeds scanned, so this
+ // 8-word store cannot clobber words that have not been loaded yet.
+ v.Compress(m).StoreSlice(buf[cnt:])
+ // Count the mask bits
+ mbits := uint64(m.ToBits())
+ mbits &= 0xFF // Only the lower 8 bits are meaningful.
+ nonNilCnt := bits.OnesCount64(mbits)
+ cnt += int32(nonNilCnt)
+ }
+ // Scalar code to clean up tails.
+ for i := scanned; i < int(n); i++ {
+ if buf[i] != 0 {
+ buf[cnt] = buf[i]
+ cnt++
+ }
+ }
+ return
+}
+
+// ScanSpanPackedAVX512SIMD is the Go SIMD counterpart of ScanSpanPackedAVX512:
+// it gathers the marked pointer words of the span at mem into bufp, then
+// filters out nil pointers in place, returning the number of non-nil
+// pointers written.
+func ScanSpanPackedAVX512SIMD(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+ return FilterNilSIMD(bufp, scanSpanPackedAVX512SIMD(mem, bufp, objMarks, sizeClass, ptrMask))
+}
+
+// scanSpanPackedAVX512SIMD copies every word of mem that is both covered by a
+// marked object (per the object marks, expanded by size class) and flagged as
+// a pointer (per ptrMask) into buf, returning the number of words written.
+// The copied words may themselves be nil pointers; the caller filters those.
+//
+// NOTE(review): the parameter name "objDarts" looks like a typo for objMarks.
+func scanSpanPackedAVX512SIMD(mem unsafe.Pointer, buf *uintptr, objDarts *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+ // Expand the per-object mark bits into a per-word bitmap, returned as
+ // two 512-bit halves.
+ m1, m2 := gcExpandersAVX512SIMD[sizeClass](abi.NoEscape(unsafe.Pointer(objDarts)))
+ ptrm := unsafe.Pointer(ptrMask)
+ m3 := simd.LoadUint64x8((*[8]uint64)(ptrm))
+ m4 := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(ptrm) + 64)))
+
+ // masks[i] is the scan mask for 8-word group i (mark AND pointer bits);
+ // counts[i] is its popcount.
+ masks := [128]uint8{}
+ counts := [128]uint8{}
+ m1m3 := m1.And(m3).AsUint8x64()
+ m2m4 := m2.And(m4).AsUint8x64()
+ m1m3.Store((*[64]uint8)(unsafe.Pointer(&masks[0])))
+ m2m4.Store((*[64]uint8)(unsafe.Pointer(&masks[64])))
+ m1m3.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[0])))
+ m2m4.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[64])))
+
+ for i := range 128 {
+ mv := masks[i]
+ if mv == 0 {
+ continue
+ }
+ m := simd.Mask64x8FromBits(mv)
+ ptrs := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(mem) + uintptr(i*64))))
+ // Full 8-word store; lanes past counts[i] are either overwritten by
+ // the next group's store or excluded by the returned count.
+ ptrs.Compress(m).Store((*[8]uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(buf)) + uintptr(count*8))))
+ count += int32(counts[i])
+ }
+ simd.ClearAVXUpperBits()
+ return
+}
+
var avx512ScanPackedReqsMet = cpu.X86.HasAVX512VL &&
cpu.X86.HasAVX512BW &&
cpu.X86.HasGFNI &&
diff --git a/src/internal/runtime/gc/scan/scan_amd64_test.go b/src/internal/runtime/gc/scan/scan_amd64_test.go
index a914b4f..ee1d13c 100644
--- a/src/internal/runtime/gc/scan/scan_amd64_test.go
+++ b/src/internal/runtime/gc/scan/scan_amd64_test.go
@@ -17,3 +17,10 @@
}
testScanSpanPacked(t, scan.ScanSpanPackedAVX512)
}
+
+// TestScanSpanPackedAVX512SIMD runs the shared scan-span test harness against
+// the Go SIMD kernel, skipping on hardware without the required AVX-512 features.
+func TestScanSpanPackedAVX512SIMD(t *testing.T) {
+ if !scan.CanAVX512() {
+ t.Skip("no AVX512")
+ }
+ testScanSpanPacked(t, scan.ScanSpanPackedAVX512SIMD)
+}
diff --git a/src/internal/runtime/gc/scan/scan_generic.go b/src/internal/runtime/gc/scan/scan_generic.go
index a4d5182..2ba0e95 100644
--- a/src/internal/runtime/gc/scan/scan_generic.go
+++ b/src/internal/runtime/gc/scan/scan_generic.go
@@ -21,3 +21,6 @@
func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
}
+// ScanSpanPackedAsm falls back to the portable Go implementation on
+// platforms that have no assembly scan kernel.
+func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+ return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
+}
diff --git a/src/internal/runtime/gc/scan/scan_test.go b/src/internal/runtime/gc/scan/scan_test.go
index 14a0f6f..fc5e97b 100644
--- a/src/internal/runtime/gc/scan/scan_test.go
+++ b/src/internal/runtime/gc/scan/scan_test.go
@@ -203,6 +203,13 @@
scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
}
})
+ // Benchmark the assembly kernels (ScanSpanPackedAsm) alongside the
+ // default implementation so the two can be compared with benchstat.
+ b.Run("impl=PlatformAsm", func(b *testing.B) {
+ b.SetBytes(avgBytes)
+ for i := range b.N {
+ page := pageOrder[i%len(pageOrder)]
+ scan.ScanSpanPackedAsm(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
+ }
+ })
}
})
}
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
This CL reworked CL 688415 for the comments from @mkny...@google.com.
I added the new generator, but the naming parts are not done yet (also the GOEXPERIMENT plumbing).
This CL will be updated
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
This CL reworked CL 688415 for the comments from @mkny...@google.com.
I added the new generator, but the naming parts are not done yet (also the GOEXPERIMENT plumbing).
This CL will be updated
Done.
One note: I am not sure how to do PCALIGN in Go code, so I left a TODO there.
Apart from this as far as I know the Go SIMD code should be equivalent to Asm codes...
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
nice job! looks good overall, mostly comments about naming and adding more comments
[dev.simd] runtime/gc: add simd package based greentea kernelsinternal/runtime/gc
This CL adds a new generator to runtime/gc/scan that generates expanderinternal/runtime/gc/scan
The Go SIMD kernels on average is -11% in ns/op compared to Asm kernels
in BenchmarkScanSpanPacked.is this mostly coming from the FilterNil kernels? I imagine the rest of it should be identical.
// assemblies, this file generates Go code using the simdnit: "that uses the simd package." I don't think we need to mention the GOEXPERIMENT, it's just another comment that's going to get stale once we remove the GOEXPERIMENT.
// assemblies, this file generates Go code using the simdnit: assembly code
type expandersListData struct {nit: I think Data is redundant here. you can just call this expandersList, or even better, just "expanders."
NumFuncs intthis is redundant. can't we just use the length of Funcs? I'm pretty sure you can get a slice value's length in a template.
(better yet, perhaps this should just be `type expanders []string`?)
type expanderDataData struct {I think you can just call this 'expanderData'. AFAICT this is just for the global tables?
on that note, this could use a comment explaining what it is, and a comment on the fields explaining what they do.
type expanderData struct {maybe call this 'expanderFunc' or just 'expander'?
Name string
BodyLoad string
Body stringcomments on these fields would be very helpful.
dataV2N map[string]stringmaybe "dataByVals"?
// AX *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
//
// The output is
//
// Z1 [64]uint8 = The bottom 512 bits of the expanded bitmap
// Z2 [64]uint8 = The top 512 bits of the expanded bitmap
//
// TODO(austin): This should Z0/Z1.this comment needs an update, you're not selecting registers anymore
return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)IMO, just make this a panic and have the tests skip if we're not on amd64. (maybe make a new global constant that indicates what platforms ScanSpanPackedAsm is supported on.)
I think the current situation is a little misleading.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
The Go SIMD kernels on average is -11% in ns/op compared to Asm kernels
in BenchmarkScanSpanPacked.please add the benchstat output directly in the commit message, like we do for some other commits. you don't need to include all the results, a small sample is fine.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
[dev.simd] runtime/gc: add simd package based greentea kernelsJunyang Shaointernal/runtime/gc
Done
This CL adds a new generator to runtime/gc/scan that generates expanderJunyang Shaointernal/runtime/gc/scan
Done
The Go SIMD kernels on average is -11% in ns/op compared to Asm kernels
in BenchmarkScanSpanPacked.please add the benchstat output directly in the commit message, like we do for some other commits. you don't need to include all the results, a small sample is fine.
Done
The Go SIMD kernels on average is -11% in ns/op compared to Asm kernels
in BenchmarkScanSpanPacked.is this mostly coming from the FilterNil kernels? I imagine the rest of it should be identical.
I guess so. Some more context of the benchmarkings I did:
The benchmark in this CL description is comparing:
(1) FilterNilAVX512 + Go SIMD expanders/scanners
(2) FilterNil + Asm expanders/scanners
And the result is (1) is +25% on throughput compared to (2).
I did this before on the older CL:
(3) FilterNil + Go SIMD expanders/scanners
(4) FilterNil + Asm expanders/scanners
The throughput were roughly the same.
But since then we had added many optimizations to the SIMD branch, and on the older CL, (1) is only +10% to (2). Now since (2) is +25% to (1), it might be possible that (3) is also strictly better than (4).
I will run another benchmark later. :D
// assemblies, this file generates Go code using the simdnit: "that uses the simd package." I don't think we need to mention the GOEXPERIMENT, it's just another comment that's going to get stale once we remove the GOEXPERIMENT.
Done
// assemblies, this file generates Go code using the simdJunyang Shaonit: assembly code
Done
nit: I think Data is redunant here. you can just call this expandersList, or even better, just "expanders."
Done
this is redundant. can't we just use the length of Funcs? I'm pretty sure you can get a slice value's length in a template.
(better yet, perhaps this should just be `type expanders []string`?)
We probably just don't need that type, updated. Thanks!
I think you can just call this 'expanderData'. AFAICT this is just for the global tables?
on that note, this could use a comment explaining what it is, and a comment on the fields explaining what they do.
Done
maybe call this 'expanderFunc' or just 'expander'?
Done
comments on these fields would be very helpful.
Done
dataV2N map[string]stringJunyang Shaomaybe "dataByVals"?
Done
// AX *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
//
// The output is
//
// Z1 [64]uint8 = The bottom 512 bits of the expanded bitmap
// Z2 [64]uint8 = The top 512 bits of the expanded bitmap
//
// TODO(austin): This should Z0/Z1.this comment needs an update, you're not selecting registers anymore
Done
return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)IMO, just make this a panic and have the tests skip if we're not on amd64. (maybe make a new global constant that indicates what platforms ScanSpanPackedAsm is supported on.)
I think the current situation is a little misleading.
I think both these 2 functions are guarded by `HasFastScanSpanPacked` and they will never actually be called, I made them both panic.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
"simd"I'm confused, doesn't this need a build goexperiment.simd build tag?
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
I'm confused, doesn't this need a build goexperiment.simd build tag?
Ohh you might be right, maybe it's because the simd experiment is always on in this branch so the compilation doesn't fail.
I will update...
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Junyang ShaoI'm confused, doesn't this need a build goexperiment.simd build tag?
Ohh you might be right, maybe it's because the simd experiment is always on in this branch so the compilation doesn't fail.
I will update...
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Code-Review | +2 |
nit: can you rename this file to `expanders_amd64.s`? just to match your new file.
// expander is the expander function, it only provides 3 kinds of values:nit: "it only operates on 3 kinds of values" maybe?
fn.Body += fmt.Sprintf("%s := %s.GaloisFieldAffineTransform(%s.AsUint64x8(), 0)\n", v, data, matrix)this has O(n^2) behavior since we're appending to a string, forcing an O(n) copy of the string on every append. this can be more efficient, and perhaps be written more cleanly, if `fn.Body` was a `strings.Builder`. then instead of `+= fmt.Sprintf(...)` you would do `fmt.Fprintf(&fn.Body, ...)`.
this is just a generator so the performance doesn't really matter, but it does nag at my brain... 😄
if you don't feel like changing this, feel free to just acknowledge this comment and move on.
panic("should not reach")ah, sorry, I think ScanSpanPacked should still call ScapSpanPackedGo in the generic implementation. I just think the assembly one should crash.
panic("should not reach")nit: we usually write "not implemented" here
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |