[go/dev.simd] [dev.simd] runtime/gc: generate greentea expand kernels in Go SIMD

8 views
Skip to first unread message

Junyang Shao (Gerrit)

unread,
Nov 11, 2025, 1:47:27 AM (2 days ago) Nov 11
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Junyang Shao has uploaded the change for review

Commit message

[dev.simd] runtime/gc: generate greentea expand kernels in Go SIMD

This CL adds a new generator to internal/runtime/gc/scan that generates Go
code using the new simd package.

This CL also includes the plumbing: the Go SIMD kernels are used when
GOEXPERIMENT=simd is on.
Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f

Change diff

diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go
index 93abfd3..9cd5b099 100644
--- a/src/go/build/deps_test.go
+++ b/src/go/build/deps_test.go
@@ -87,6 +87,7 @@
internal/profilerecord,
internal/trace/tracev2,
math/bits,
+ simd,
structs
< internal/bytealg
< internal/stringslite
@@ -826,7 +827,8 @@
os,
reflect,
strings,
- sync
+ sync,
+ regexp
< internal/runtime/gc/internal/gen;

regexp, internal/txtar, internal/trace, internal/trace/raw
diff --git a/src/internal/runtime/gc/scan/expand_amd64.go b/src/internal/runtime/gc/scan/expand_amd64.go
index 9bea471..c5764e3 100644
--- a/src/internal/runtime/gc/scan/expand_amd64.go
+++ b/src/internal/runtime/gc/scan/expand_amd64.go
@@ -4,7 +4,11 @@

package scan

-import "internal/runtime/gc"
+import (
+ "internal/runtime/gc"
+ "simd"
+ "unsafe"
+)

// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked,
// where f is the word size of objects in sizeClass.
@@ -20,3 +24,14 @@
//
// It is defined in assembly.
var gcExpandersAVX512 [len(gc.SizeClassToSize)]uintptr
+
+// ExpandAVX512SIMD expands each bit in packed into f consecutive bits in
+// unpacked, where f is the word size of objects in sizeClass. It dispatches
+// through gcExpandersAVX512SIMD, the table of kernels written with the simd
+// package.
+//
+// sizeClass must select a non-nil entry of gcExpandersAVX512SIMD; calling
+// through a nil entry panics.
+//
+// This is a testing entrypoint to the expanders used by scanSpanPacked*.
+func ExpandAVX512SIMD(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) {
+	lo, hi := gcExpandersAVX512SIMD[sizeClass](unsafe.Pointer(packed))
+	// The kernel returns its result as two 64-byte vectors; store them back
+	// to back starting at unpacked.
+	out := unsafe.Pointer(unpacked)
+	lo.Store((*[8]uint64)(out))
+	hi.Store((*[8]uint64)(unsafe.Add(out, 64)))
+	simd.ClearAVXUpperBits()
+}
diff --git a/src/internal/runtime/gc/scan/expand_amd64_test.go b/src/internal/runtime/gc/scan/expand_amd64_test.go
index a8f5b88..692bc7c 100644
--- a/src/internal/runtime/gc/scan/expand_amd64_test.go
+++ b/src/internal/runtime/gc/scan/expand_amd64_test.go
@@ -17,3 +17,10 @@
}
testExpand(t, scan.ExpandAVX512)
}
+
+// TestExpandAVX512SIMD runs testExpand against scan.ExpandAVX512SIMD.
+// The test is skipped when scan.CanAVX512 reports false.
+func TestExpandAVX512SIMD(t *testing.T) {
+	if hasAVX512 := scan.CanAVX512(); !hasAVX512 {
+		t.Skip("no AVX512")
+	}
+	testExpand(t, scan.ExpandAVX512SIMD)
+}
diff --git a/src/internal/runtime/gc/scan/expand_test.go b/src/internal/runtime/gc/scan/expand_test.go
index 692817d..2e75574 100644
--- a/src/internal/runtime/gc/scan/expand_test.go
+++ b/src/internal/runtime/gc/scan/expand_test.go
@@ -23,7 +23,7 @@

for i := range want {
if got[i] != want[i] {
- t.Errorf("expansion differs from reference at bit %d", i*goarch.PtrSize)
+ t.Errorf("expansion differs from reference at bit %d, sizeClass=%d", i*goarch.PtrSize, sizeClass)
if goarch.PtrSize == 4 {
t.Logf("got: %032b", got[i])
t.Logf("want: %032b", want[i])
diff --git a/src/internal/runtime/gc/scan/expanders_amd64.go b/src/internal/runtime/gc/scan/expanders_amd64.go
new file mode 100644
index 0000000..ea6c643
--- /dev/null
+++ b/src/internal/runtime/gc/scan/expanders_amd64.go
@@ -0,0 +1,1528 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+import (
+ "simd"
+ "unsafe"
+)
+
+// gcExpandersAVX512SIMD is the table of Go SIMD expand kernels. It is indexed
+// by size class (see ExpandAVX512SIMD). Each kernel takes a pointer to the
+// packed input and returns its result as two simd.Uint64x8 vectors.
+//
+// Entry 0 and entries 27 through 67 are nil; calling through a nil entry
+// panics.
+//
+// NOTE(review): the numeric suffix in each kernel name is presumably the
+// object size in words for that size class — confirm against
+// gc.SizeClassToSize.
+// NOTE(review): the length is hard-coded as 68, whereas gcExpandersAVX512 is
+// sized with len(gc.SizeClassToSize); confirm the two stay in sync.
+var gcExpandersAVX512SIMD = [68]func(unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8){
+ nil,
+ expandAVX512_1SIMD,
+ expandAVX512_2SIMD,
+ expandAVX512_3SIMD,
+ expandAVX512_4SIMD,
+ expandAVX512_6SIMD,
+ expandAVX512_8SIMD,
+ expandAVX512_10SIMD,
+ expandAVX512_12SIMD,
+ expandAVX512_14SIMD,
+ expandAVX512_16SIMD,
+ expandAVX512_18SIMD,
+ expandAVX512_20SIMD,
+ expandAVX512_22SIMD,
+ expandAVX512_24SIMD,
+ expandAVX512_26SIMD,
+ expandAVX512_28SIMD,
+ expandAVX512_30SIMD,
+ expandAVX512_32SIMD,
+ expandAVX512_36SIMD,
+ expandAVX512_40SIMD,
+ expandAVX512_44SIMD,
+ expandAVX512_48SIMD,
+ expandAVX512_52SIMD,
+ expandAVX512_56SIMD,
+ expandAVX512_60SIMD,
+ expandAVX512_64SIMD,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+}
+
+// expandAVX512_1SIMD is the expand kernel at index 1 of
+// gcExpandersAVX512SIMD. It applies no transformation: it loads the 128 bytes
+// at src as two 64-byte vectors and returns them unchanged.
+func expandAVX512_1SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	lo := simd.LoadUint64x8((*[8]uint64)(src))
+	// unsafe.Add keeps the offset computation in pointer space instead of
+	// round-tripping through uintptr.
+	hi := simd.LoadUint64x8((*[8]uint64)(unsafe.Add(src, 64)))
+	return lo, hi
+}
+
+var expandAVX512_2SIMD_mat0 = [8]uint64{
+ 0x0101020204040808, 0x1010202040408080, 0x0101020204040808, 0x1010202040408080,
+ 0x0101020204040808, 0x1010202040408080, 0x0101020204040808, 0x1010202040408080,
+}
+var expandAVX512_2SIMD_inShuf0 = [8]uint64{
+ 0x0706050403020100, 0x0706050403020100, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+ 0x1716151413121110, 0x1716151413121110, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918,
+}
+var expandAVX512_2SIMD_inShuf1 = [8]uint64{
+ 0x2726252423222120, 0x2726252423222120, 0x2f2e2d2c2b2a2928, 0x2f2e2d2c2b2a2928,
+ 0x3736353433323130, 0x3736353433323130, 0x3f3e3d3c3b3a3938, 0x3f3e3d3c3b3a3938,
+}
+var expandAVX512_2SIMD_outShufLo = [8]uint64{
+ 0x0b030a0209010800, 0x0f070e060d050c04, 0x1b131a1219111810, 0x1f171e161d151c14,
+ 0x2b232a2229212820, 0x2f272e262d252c24, 0x3b333a3239313830, 0x3f373e363d353c34,
+}
+
+// expandAVX512_2SIMD is the expand kernel at index 2 of
+// gcExpandersAVX512SIMD. It loads 64 bytes from src and, for each of its two
+// results, gathers input bytes with Permute (inShuf0 / inShuf1), applies
+// GaloisFieldAffineTransform with mat0, and reorders the bytes with Permute
+// using outShufLo.
+func expandAVX512_2SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	mat := simd.LoadUint64x8(&expandAVX512_2SIMD_mat0)
+	outShuf := simd.LoadUint64x8(&expandAVX512_2SIMD_outShufLo).AsUint8x64()
+
+	gather0 := simd.LoadUint64x8(&expandAVX512_2SIMD_inShuf0).AsUint8x64()
+	lo := in.Permute(gather0).GaloisFieldAffineTransform(mat, 0).Permute(outShuf)
+
+	gather1 := simd.LoadUint64x8(&expandAVX512_2SIMD_inShuf1).AsUint8x64()
+	hi := in.Permute(gather1).GaloisFieldAffineTransform(mat, 0).Permute(outShuf)
+
+	return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_3SIMD_mat0 = [8]uint64{
+ 0x0101010202020404, 0x0408080810101020, 0x2020404040808080, 0x0101010202020404,
+ 0x0408080810101020, 0x2020404040808080, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_3SIMD_inShuf0 = [8]uint64{
+ 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0f0e0d0c0b0a0908,
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_3SIMD_inShuf1 = [8]uint64{
+ 0x1716151413121110, 0x1716151413121110, 0x1716151413121110, 0x1f1e1d1c1b1a1918,
+ 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_3SIMD_inShuf2 = [8]uint64{
+ 0x2726252423222120, 0x2726252423222120, 0x2726252423222120, 0xffffffffff2a2928,
+ 0xffffffffff2a2928, 0xffffffffffff2928, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_3SIMD_outShufLo = [8]uint64{
+ 0x0a02110901100800, 0x05140c04130b0312, 0x170f07160e06150d, 0x221a292119282018,
+ 0x1d2c241c2b231b2a, 0x2f271f2e261e2d25, 0x4a42514941504840, 0x45544c44534b4352,
+}
+var expandAVX512_3SIMD_outShufHi = [8]uint64{
+ 0x170f07160e06150d, 0x221a292119282018, 0x1d2c241c2b231b2a, 0x2f271f2e261e2d25,
+ 0x4a42514941504840, 0x45544c44534b4352, 0x574f47564e46554d, 0x625a696159686058,
+}
+
+// expandAVX512_3SIMD is the expand kernel at index 3 of
+// gcExpandersAVX512SIMD. It loads 64 bytes from src and builds three
+// intermediate vectors, each a Permute of the input (inShuf0..inShuf2)
+// followed by GaloisFieldAffineTransform with mat0. The first result is a
+// Permute2 over intermediates 0 and 1 using outShufLo; the second is a
+// Permute2 over intermediates 1 and 2 using outShufHi.
+func expandAVX512_3SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	mat := simd.LoadUint64x8(&expandAVX512_3SIMD_mat0)
+
+	gather0 := simd.LoadUint64x8(&expandAVX512_3SIMD_inShuf0).AsUint8x64()
+	gather1 := simd.LoadUint64x8(&expandAVX512_3SIMD_inShuf1).AsUint8x64()
+	gather2 := simd.LoadUint64x8(&expandAVX512_3SIMD_inShuf2).AsUint8x64()
+	part0 := in.Permute(gather0).GaloisFieldAffineTransform(mat, 0)
+	part1 := in.Permute(gather1).GaloisFieldAffineTransform(mat, 0)
+	part2 := in.Permute(gather2).GaloisFieldAffineTransform(mat, 0)
+
+	shufLo := simd.LoadUint64x8(&expandAVX512_3SIMD_outShufLo).AsUint8x64()
+	shufHi := simd.LoadUint64x8(&expandAVX512_3SIMD_outShufHi).AsUint8x64()
+	lo := part0.Permute2(part1, shufLo)
+	hi := part1.Permute2(part2, shufHi)
+
+	return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_4SIMD_mat0 = [8]uint64{
+ 0x0101010102020202, 0x0404040408080808, 0x1010101020202020, 0x4040404080808080,
+ 0x0101010102020202, 0x0404040408080808, 0x1010101020202020, 0x4040404080808080,
+}
+var expandAVX512_4SIMD_inShuf0 = [8]uint64{
+ 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+}
+var expandAVX512_4SIMD_inShuf1 = [8]uint64{
+ 0x1716151413121110, 0x1716151413121110, 0x1716151413121110, 0x1716151413121110,
+ 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918,
+}
+var expandAVX512_4SIMD_outShufLo = [8]uint64{
+ 0x1911090118100800, 0x1b130b031a120a02, 0x1d150d051c140c04, 0x1f170f071e160e06,
+ 0x3931292138302820, 0x3b332b233a322a22, 0x3d352d253c342c24, 0x3f372f273e362e26,
+}
+
+// expandAVX512_4SIMD is the expand kernel at index 4 of
+// gcExpandersAVX512SIMD. It loads 64 bytes from src and, for each of its two
+// results, gathers input bytes with Permute (inShuf0 / inShuf1), applies
+// GaloisFieldAffineTransform with mat0, and reorders the bytes with Permute
+// using outShufLo.
+func expandAVX512_4SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	mat := simd.LoadUint64x8(&expandAVX512_4SIMD_mat0)
+	outShuf := simd.LoadUint64x8(&expandAVX512_4SIMD_outShufLo).AsUint8x64()
+
+	gather0 := simd.LoadUint64x8(&expandAVX512_4SIMD_inShuf0).AsUint8x64()
+	lo := in.Permute(gather0).GaloisFieldAffineTransform(mat, 0).Permute(outShuf)
+
+	gather1 := simd.LoadUint64x8(&expandAVX512_4SIMD_inShuf1).AsUint8x64()
+	hi := in.Permute(gather1).GaloisFieldAffineTransform(mat, 0).Permute(outShuf)
+
+	return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_6SIMD_mat0 = [8]uint64{
+ 0x0101010101010202, 0x0202020204040404, 0x0404080808080808, 0x1010101010102020,
+ 0x2020202040404040, 0x4040808080808080, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_6SIMD_inShuf0 = [8]uint64{
+ 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+ 0x0706050403020100, 0x0706050403020100, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_6SIMD_inShuf1 = [8]uint64{
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_6SIMD_inShuf2 = [8]uint64{
+ 0xffff151413121110, 0xffff151413121110, 0xffffff1413121110, 0xffffff1413121110,
+ 0xffffff1413121110, 0xffffff1413121110, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_6SIMD_outShufLo = [8]uint64{
+ 0x0901282018100800, 0x1a120a0229211911, 0x2b231b130b032a22, 0x0d052c241c140c04,
+ 0x1e160e062d251d15, 0x2f271f170f072e26, 0x4941686058504840, 0x5a524a4269615951,
+}
+var expandAVX512_6SIMD_outShufHi = [8]uint64{
+ 0x2b231b130b032a22, 0x0d052c241c140c04, 0x1e160e062d251d15, 0x2f271f170f072e26,
+ 0x4941686058504840, 0x5a524a4269615951, 0x6b635b534b436a62, 0x4d456c645c544c44,
+}
+
+// expandAVX512_6SIMD is the expand kernel at index 5 of
+// gcExpandersAVX512SIMD. It loads 64 bytes from src and builds three
+// intermediate vectors, each a Permute of the input (inShuf0..inShuf2)
+// followed by GaloisFieldAffineTransform with mat0. The first result is a
+// Permute2 over intermediates 0 and 1 using outShufLo; the second is a
+// Permute2 over intermediates 1 and 2 using outShufHi.
+func expandAVX512_6SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	mat := simd.LoadUint64x8(&expandAVX512_6SIMD_mat0)
+
+	gather0 := simd.LoadUint64x8(&expandAVX512_6SIMD_inShuf0).AsUint8x64()
+	gather1 := simd.LoadUint64x8(&expandAVX512_6SIMD_inShuf1).AsUint8x64()
+	gather2 := simd.LoadUint64x8(&expandAVX512_6SIMD_inShuf2).AsUint8x64()
+	part0 := in.Permute(gather0).GaloisFieldAffineTransform(mat, 0)
+	part1 := in.Permute(gather1).GaloisFieldAffineTransform(mat, 0)
+	part2 := in.Permute(gather2).GaloisFieldAffineTransform(mat, 0)
+
+	shufLo := simd.LoadUint64x8(&expandAVX512_6SIMD_outShufLo).AsUint8x64()
+	shufHi := simd.LoadUint64x8(&expandAVX512_6SIMD_outShufHi).AsUint8x64()
+	lo := part0.Permute2(part1, shufLo)
+	hi := part1.Permute2(part2, shufHi)
+
+	return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_8SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_8SIMD_inShuf0 = [8]uint64{
+ 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+ 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+}
+var expandAVX512_8SIMD_inShuf1 = [8]uint64{
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+ 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+}
+var expandAVX512_8SIMD_outShufLo = [8]uint64{
+ 0x3830282018100800, 0x3931292119110901, 0x3a322a221a120a02, 0x3b332b231b130b03,
+ 0x3c342c241c140c04, 0x3d352d251d150d05, 0x3e362e261e160e06, 0x3f372f271f170f07,
+}
+
+// expandAVX512_8SIMD is the expand kernel at index 6 of
+// gcExpandersAVX512SIMD. It loads 64 bytes from src and, for each of its two
+// results, gathers input bytes with Permute (inShuf0 / inShuf1), applies
+// GaloisFieldAffineTransform with mat0, and reorders the bytes with Permute
+// using outShufLo.
+func expandAVX512_8SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	mat := simd.LoadUint64x8(&expandAVX512_8SIMD_mat0)
+	outShuf := simd.LoadUint64x8(&expandAVX512_8SIMD_outShufLo).AsUint8x64()
+
+	gather0 := simd.LoadUint64x8(&expandAVX512_8SIMD_inShuf0).AsUint8x64()
+	lo := in.Permute(gather0).GaloisFieldAffineTransform(mat, 0).Permute(outShuf)
+
+	gather1 := simd.LoadUint64x8(&expandAVX512_8SIMD_inShuf1).AsUint8x64()
+	hi := in.Permute(gather1).GaloisFieldAffineTransform(mat, 0).Permute(outShuf)
+
+	return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_10SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101020202020202, 0x0202020204040404, 0x0404040404040808,
+ 0x0808080808080808, 0x1010101010101010, 0x1010202020202020, 0x2020202040404040,
+}
+var expandAVX512_10SIMD_inShuf0 = [8]uint64{
+ 0xff06050403020100, 0xff06050403020100, 0xff06050403020100, 0xff06050403020100,
+ 0xffff050403020100, 0xffff050403020100, 0xffff050403020100, 0xffff050403020100,
+}
+var expandAVX512_10SIMD_mat1 = [8]uint64{
+ 0x4040404040408080, 0x8080808080808080, 0x0808080808080808, 0x1010101010101010,
+ 0x1010202020202020, 0x2020202040404040, 0x4040404040408080, 0x8080808080808080,
+}
+var expandAVX512_10SIMD_inShuf1 = [8]uint64{
+ 0xffff050403020100, 0xffff050403020100, 0xff0c0b0a09080706, 0xff0c0b0a09080706,
+ 0xff0c0b0a09080706, 0xff0c0b0a09080706, 0xffff0b0a09080706, 0xffff0b0a09080706,
+}
+var expandAVX512_10SIMD_mat2 = [8]uint64{
+ 0x0101010101010101, 0x0101020202020202, 0x0202020204040404, 0x0404040404040808,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_10SIMD_inShuf2 = [8]uint64{
+ 0xffff0c0b0a090807, 0xffff0c0b0a090807, 0xffff0c0b0a090807, 0xffff0c0b0a090807,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_10SIMD_outShufLo = [8]uint64{
+ 0x3830282018100800, 0x2921191109014840, 0x1a120a0249413931, 0x0b034a423a322a22,
+ 0x4b433b332b231b13, 0x3c342c241c140c04, 0x2d251d150d054c44, 0x1e160e064d453d35,
+}
+var expandAVX512_10SIMD_outShufHi = [8]uint64{
+ 0x4840383028201810, 0x3931292119115850, 0x2a221a1259514941, 0x1b135a524a423a32,
+ 0x5b534b433b332b23, 0x4c443c342c241c14, 0x3d352d251d155c54, 0x2e261e165d554d45,
+}
+
+// expandAVX512_10SIMD is the expand kernel at index 7 of
+// gcExpandersAVX512SIMD. It loads 64 bytes from src and builds three
+// intermediate vectors; intermediate i is a Permute of the input with
+// inShuf<i> followed by GaloisFieldAffineTransform with mat<i>. The first
+// result is a Permute2 over intermediates 0 and 1 using outShufLo; the second
+// is a Permute2 over intermediates 1 and 2 using outShufHi.
+func expandAVX512_10SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+	mat0 := simd.LoadUint64x8(&expandAVX512_10SIMD_mat0)
+	gather0 := simd.LoadUint64x8(&expandAVX512_10SIMD_inShuf0).AsUint8x64()
+	part0 := in.Permute(gather0).GaloisFieldAffineTransform(mat0, 0)
+
+	mat1 := simd.LoadUint64x8(&expandAVX512_10SIMD_mat1)
+	gather1 := simd.LoadUint64x8(&expandAVX512_10SIMD_inShuf1).AsUint8x64()
+	part1 := in.Permute(gather1).GaloisFieldAffineTransform(mat1, 0)
+
+	mat2 := simd.LoadUint64x8(&expandAVX512_10SIMD_mat2)
+	gather2 := simd.LoadUint64x8(&expandAVX512_10SIMD_inShuf2).AsUint8x64()
+	part2 := in.Permute(gather2).GaloisFieldAffineTransform(mat2, 0)
+
+	shufLo := simd.LoadUint64x8(&expandAVX512_10SIMD_outShufLo).AsUint8x64()
+	shufHi := simd.LoadUint64x8(&expandAVX512_10SIMD_outShufHi).AsUint8x64()
+	lo := part0.Permute2(part1, shufLo)
+	hi := part1.Permute2(part2, shufHi)
+
+	return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_12SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_12SIMD_inShuf0 = [8]uint64{
+ 0xffff050403020100, 0xffff050403020100, 0xffff050403020100, 0xffff050403020100,
+ 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+}
+var expandAVX512_12SIMD_mat1 = [8]uint64{
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_12SIMD_inShuf1 = [8]uint64{
+ 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+ 0xffff0a0908070605, 0xffff0a0908070605, 0xffff0a0908070605, 0xffff0a0908070605,
+}
+var expandAVX512_12SIMD_mat2 = [8]uint64{
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+}
+var expandAVX512_12SIMD_inShuf2 = [8]uint64{
+ 0xffffff0908070605, 0xffffff0908070605, 0xffffff0908070605, 0xffffff0908070605,
+ 0xffffff0a09080706, 0xffffff0a09080706, 0xffffff0a09080706, 0xffffff0a09080706,
+}
+var expandAVX512_12SIMD_outShufLo = [8]uint64{
+ 0x3830282018100800, 0x1911090158504840, 0x5951494139312921, 0x3a322a221a120a02,
+ 0x1b130b035a524a42, 0x5b534b433b332b23, 0x3c342c241c140c04, 0x1d150d055c544c44,
+}
+var expandAVX512_12SIMD_outShufHi = [8]uint64{
+ 0x5850484038302820, 0x3931292178706860, 0x7971696159514941, 0x5a524a423a322a22,
+ 0x3b332b237a726a62, 0x7b736b635b534b43, 0x5c544c443c342c24, 0x3d352d257c746c64,
+}
+
+// expandAVX512_12SIMD is the expand kernel at index 8 of
+// gcExpandersAVX512SIMD. It loads 64 bytes from src and builds three
+// intermediate vectors; intermediate i is a Permute of the input with
+// inShuf<i> followed by GaloisFieldAffineTransform with mat<i>. The first
+// result is a Permute2 over intermediates 0 and 1 using outShufLo; the second
+// is a Permute2 over intermediates 1 and 2 using outShufHi.
+func expandAVX512_12SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+	mat0 := simd.LoadUint64x8(&expandAVX512_12SIMD_mat0)
+	gather0 := simd.LoadUint64x8(&expandAVX512_12SIMD_inShuf0).AsUint8x64()
+	part0 := in.Permute(gather0).GaloisFieldAffineTransform(mat0, 0)
+
+	mat1 := simd.LoadUint64x8(&expandAVX512_12SIMD_mat1)
+	gather1 := simd.LoadUint64x8(&expandAVX512_12SIMD_inShuf1).AsUint8x64()
+	part1 := in.Permute(gather1).GaloisFieldAffineTransform(mat1, 0)
+
+	mat2 := simd.LoadUint64x8(&expandAVX512_12SIMD_mat2)
+	gather2 := simd.LoadUint64x8(&expandAVX512_12SIMD_inShuf2).AsUint8x64()
+	part2 := in.Permute(gather2).GaloisFieldAffineTransform(mat2, 0)
+
+	shufLo := simd.LoadUint64x8(&expandAVX512_12SIMD_outShufLo).AsUint8x64()
+	shufHi := simd.LoadUint64x8(&expandAVX512_12SIMD_outShufHi).AsUint8x64()
+	lo := part0.Permute2(part1, shufLo)
+	hi := part1.Permute2(part2, shufHi)
+
+	return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_14SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+ 0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_14SIMD_inShuf0 = [8]uint64{
+ 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+ 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+}
+var expandAVX512_14SIMD_mat1 = [8]uint64{
+ 0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+ 0x4040808080808080, 0x8080808080808080, 0x1010101010102020, 0x2020202020202020,
+}
+var expandAVX512_14SIMD_inShuf1 = [8]uint64{
+ 0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100,
+ 0xffffffff03020100, 0xffffffff03020100, 0xffffff0807060504, 0xffffff0807060504,
+}
+var expandAVX512_14SIMD_mat2 = [8]uint64{
+ 0x2020202040404040, 0x4040404040404040, 0x4040808080808080, 0x8080808080808080,
+ 0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+}
+var expandAVX512_14SIMD_inShuf2 = [8]uint64{
+ 0xffffff0807060504, 0xffffff0807060504, 0xffffff0807060504, 0xffffff0807060504,
+ 0xffffff0908070605, 0xffffff0908070605, 0xffffffff08070605, 0xffffffff08070605,
+}
+var expandAVX512_14SIMD_mat3 = [8]uint64{
+ 0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_14SIMD_inShuf3 = [8]uint64{
+ 0xffffffff08070605, 0xffffffff08070605, 0xffffffff08070605, 0xffffffff08070605,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_14SIMD_outShufLo = [8]uint64{
+ 0x3830282018100800, 0x0901686058504840, 0x4941393129211911, 0x1a120a0269615951,
+ 0x5a524a423a322a22, 0x2b231b130b036a62, 0x6b635b534b433b33, 0x3c342c241c140c04,
+}
+var expandAVX512_14SIMD_outShufHi0 = [8]uint64{
+ 0x6860585048403830, 0x3931ffffffff7870, 0x7971696159514941, 0x4a423a32ffffffff,
+ 0xffff7a726a625a52, 0x5b534b433b33ffff, 0xffffffff7b736b63, 0x6c645c544c443c34,
+}
+var expandAVX512_14SIMD_outShufHi1 = [8]uint64{
+ 0xffffffffffffffff, 0xffff18100800ffff, 0xffffffffffffffff, 0xffffffff19110901,
+ 0x0a02ffffffffffff, 0xffffffffffff1a12, 0x1b130b03ffffffff, 0xffffffffffffffff,
+}
+
+// expandAVX512_14SIMD is the expand kernel at index 9 of
+// gcExpandersAVX512SIMD. It loads 64 bytes from src and builds four
+// intermediate vectors; intermediate i is a Permute of the input with
+// inShuf<i> followed by GaloisFieldAffineTransform with mat<i>.
+//
+// The first result is a Permute2 over intermediates 0 and 1 using outShufLo.
+// The second result is the Or of two masked pieces: a Permute2 over
+// intermediates 1 and 2 (outShufHi0) and a Permute of intermediate 3
+// (outShufHi1). The two mask constants are bitwise complements of each other.
+func expandAVX512_14SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+	mat0 := simd.LoadUint64x8(&expandAVX512_14SIMD_mat0)
+	gather0 := simd.LoadUint64x8(&expandAVX512_14SIMD_inShuf0).AsUint8x64()
+	part0 := in.Permute(gather0).GaloisFieldAffineTransform(mat0, 0)
+
+	mat1 := simd.LoadUint64x8(&expandAVX512_14SIMD_mat1)
+	gather1 := simd.LoadUint64x8(&expandAVX512_14SIMD_inShuf1).AsUint8x64()
+	part1 := in.Permute(gather1).GaloisFieldAffineTransform(mat1, 0)
+
+	mat2 := simd.LoadUint64x8(&expandAVX512_14SIMD_mat2)
+	gather2 := simd.LoadUint64x8(&expandAVX512_14SIMD_inShuf2).AsUint8x64()
+	part2 := in.Permute(gather2).GaloisFieldAffineTransform(mat2, 0)
+
+	mat3 := simd.LoadUint64x8(&expandAVX512_14SIMD_mat3)
+	gather3 := simd.LoadUint64x8(&expandAVX512_14SIMD_inShuf3).AsUint8x64()
+	part3 := in.Permute(gather3).GaloisFieldAffineTransform(mat3, 0)
+
+	shufLo := simd.LoadUint64x8(&expandAVX512_14SIMD_outShufLo).AsUint8x64()
+	lo := part0.Permute2(part1, shufLo)
+
+	shufHi0 := simd.LoadUint64x8(&expandAVX512_14SIMD_outShufHi0).AsUint8x64()
+	shufHi1 := simd.LoadUint64x8(&expandAVX512_14SIMD_outShufHi1).AsUint8x64()
+	pairMask := simd.Mask8x64FromBits(uint64(0xff0ffc3ff0ffc3ff))
+	tailMask := simd.Mask8x64FromBits(uint64(0xf003c00f003c00))
+	hiPair := part1.Permute2(part2, shufHi0).Masked(pairMask)
+	hiTail := part3.Permute(shufHi1).Masked(tailMask)
+	hi := hiPair.Or(hiTail)
+
+	return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_16SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_16SIMD_inShuf0 = [8]uint64{
+ 0x0303020201010000, 0x0303020201010000, 0x0303020201010000, 0x0303020201010000,
+ 0x0303020201010000, 0x0303020201010000, 0x0303020201010000, 0x0303020201010000,
+}
+var expandAVX512_16SIMD_inShuf1 = [8]uint64{
+ 0x0707060605050404, 0x0707060605050404, 0x0707060605050404, 0x0707060605050404,
+ 0x0707060605050404, 0x0707060605050404, 0x0707060605050404, 0x0707060605050404,
+}
+var expandAVX512_16SIMD_outShufLo = [8]uint64{
+ 0x1918111009080100, 0x3938313029282120, 0x1b1a13120b0a0302, 0x3b3a33322b2a2322,
+ 0x1d1c15140d0c0504, 0x3d3c35342d2c2524, 0x1f1e17160f0e0706, 0x3f3e37362f2e2726,
+}
+
+// expandAVX512_16SIMD is the expand kernel at index 10 of
+// gcExpandersAVX512SIMD. It loads 64 bytes from src and, for each of its two
+// results, gathers input bytes with Permute (inShuf0 / inShuf1), applies
+// GaloisFieldAffineTransform with mat0, and reorders the bytes with Permute
+// using outShufLo.
+func expandAVX512_16SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+	mat := simd.LoadUint64x8(&expandAVX512_16SIMD_mat0)
+	outShuf := simd.LoadUint64x8(&expandAVX512_16SIMD_outShufLo).AsUint8x64()
+
+	gather0 := simd.LoadUint64x8(&expandAVX512_16SIMD_inShuf0).AsUint8x64()
+	lo := in.Permute(gather0).GaloisFieldAffineTransform(mat, 0).Permute(outShuf)
+
+	gather1 := simd.LoadUint64x8(&expandAVX512_16SIMD_inShuf1).AsUint8x64()
+	hi := in.Permute(gather1).GaloisFieldAffineTransform(mat, 0).Permute(outShuf)
+
+	return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_18SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101020202020202, 0x0202020202020202, 0x0202020204040404,
+ 0x0404040404040404, 0x0404040404040808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_18SIMD_inShuf0 = [8]uint64{
+ 0x0303020201010000, 0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100,
+ 0xffffffff03020100, 0xffffffff03020100, 0x0303020201010000, 0xff03020201010000,
+}
+var expandAVX512_18SIMD_mat1 = [8]uint64{
+ 0x1010202020202020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+ 0x4040404040408080, 0x8080808080808080, 0x1010101010101010, 0x1010202020202020,
+}
+var expandAVX512_18SIMD_inShuf1 = [8]uint64{
+ 0xffffffffff020100, 0xffffffffff020100, 0xffffffffff020100, 0xffffffffff020100,
+ 0xffffffffff020100, 0xffff020201010000, 0xff06060505040403, 0xffffffff06050403,
+}
+var expandAVX512_18SIMD_mat2 = [8]uint64{
+ 0x2020202020202020, 0x2020202040404040, 0x4040404040404040, 0x4040404040408080,
+ 0x8080808080808080, 0x0101010101010101, 0x0101020202020202, 0x0202020202020202,
+}
+var expandAVX512_18SIMD_inShuf2 = [8]uint64{
+ 0xffffffff06050403, 0xffffffff06050403, 0xffffffff06050403, 0xffffffff06050403,
+ 0x0606050504040303, 0x0707060605050404, 0xffffffffff060504, 0xffffffffff060504,
+}
+var expandAVX512_18SIMD_mat3 = [8]uint64{
+ 0x0202020204040404, 0x0404040404040404, 0x0404040404040808, 0x0808080808080808,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_18SIMD_inShuf3 = [8]uint64{
+ 0xffffffffff060504, 0xffffffffff060504, 0xffffffffff060504, 0xffff060605050404,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_18SIMD_outShufLo = [8]uint64{
+ 0x3028201810080100, 0x6058504840393831, 0x2119110903026968, 0x5149413b3a333229,
+ 0x120a05046b6a6159, 0x423d3c35342a221a, 0x07066d6c625a524a, 0x3e37362b231b130b,
+}
+var expandAVX512_18SIMD_outShufHi0 = [8]uint64{
+ 0x6160585048403830, 0xffffffff78706968, 0x59514941393231ff, 0xffff79716b6a6362,
+ 0x4a423a3433ffffff, 0x7a726d6c65645a52, 0x3b3635ffffffffff, 0x6f6e67665b534b43,
+}
+var expandAVX512_18SIMD_outShufHi1 = [8]uint64{
+ 0xffffffffffffffff, 0x18100800ffffffff, 0xffffffffffffff19, 0x0901ffffffffffff,
+ 0xffffffffff1b1a11, 0xffffffffffffffff, 0xffffff1d1c120a02, 0xffffffffffffffff,
+}
+
+// expandAVX512_18SIMD is the expand kernel at index 11 of
+// gcExpandersAVX512SIMD. It loads 64 bytes from src and builds four
+// intermediate vectors; intermediate i is a Permute of the input with
+// inShuf<i> followed by GaloisFieldAffineTransform with mat<i>.
+//
+// The first result is a Permute2 over intermediates 0 and 1 using outShufLo.
+// The second result is the Or of two masked pieces: a Permute2 over
+// intermediates 1 and 2 (outShufHi0) and a Permute of intermediate 3
+// (outShufHi1). The two mask constants are bitwise complements of each other.
+func expandAVX512_18SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+	mat0 := simd.LoadUint64x8(&expandAVX512_18SIMD_mat0)
+	gather0 := simd.LoadUint64x8(&expandAVX512_18SIMD_inShuf0).AsUint8x64()
+	part0 := in.Permute(gather0).GaloisFieldAffineTransform(mat0, 0)
+
+	mat1 := simd.LoadUint64x8(&expandAVX512_18SIMD_mat1)
+	gather1 := simd.LoadUint64x8(&expandAVX512_18SIMD_inShuf1).AsUint8x64()
+	part1 := in.Permute(gather1).GaloisFieldAffineTransform(mat1, 0)
+
+	mat2 := simd.LoadUint64x8(&expandAVX512_18SIMD_mat2)
+	gather2 := simd.LoadUint64x8(&expandAVX512_18SIMD_inShuf2).AsUint8x64()
+	part2 := in.Permute(gather2).GaloisFieldAffineTransform(mat2, 0)
+
+	mat3 := simd.LoadUint64x8(&expandAVX512_18SIMD_mat3)
+	gather3 := simd.LoadUint64x8(&expandAVX512_18SIMD_inShuf3).AsUint8x64()
+	part3 := in.Permute(gather3).GaloisFieldAffineTransform(mat3, 0)
+
+	shufLo := simd.LoadUint64x8(&expandAVX512_18SIMD_outShufLo).AsUint8x64()
+	lo := part0.Permute2(part1, shufLo)
+
+	shufHi0 := simd.LoadUint64x8(&expandAVX512_18SIMD_outShufHi0).AsUint8x64()
+	shufHi1 := simd.LoadUint64x8(&expandAVX512_18SIMD_outShufHi1).AsUint8x64()
+	pairMask := simd.Mask8x64FromBits(uint64(0xffe0fff83ffe0fff))
+	tailMask := simd.Mask8x64FromBits(uint64(0x1f0007c001f000))
+	hiPair := part1.Permute2(part2, shufHi0).Masked(pairMask)
+	hiTail := part3.Permute(shufHi1).Masked(tailMask)
+	hi := hiPair.Or(hiTail)
+
+	return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_20SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_20SIMD_inShuf0 = [8]uint64{
+ 0x0303020201010000, 0xffffffff03020100, 0xff03020201010000, 0xffff020201010000,
+ 0xffffffffff020100, 0xffff020201010000, 0xffff020201010000, 0xffffffffff020100,
+}
+var expandAVX512_20SIMD_mat1 = [8]uint64{
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0202020202020202, 0x0404040404040404, 0x0404040408080808, 0x0808080808080808,
+}
+var expandAVX512_20SIMD_inShuf1 = [8]uint64{
+ 0xffff020201010000, 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000,
+ 0xff06060505040403, 0x0606050504040303, 0xffffffff06050403, 0xffff050504040303,
+}
+var expandAVX512_20SIMD_mat2 = [8]uint64{
+ 0x1010101010101010, 0x1010101020202020, 0x2020202020202020, 0x4040404040404040,
+ 0x4040404080808080, 0x8080808080808080, 0x0101010101010101, 0x0101010102020202,
+}
+var expandAVX512_20SIMD_inShuf2 = [8]uint64{
+ 0xffff050504040303, 0xffffffffff050403, 0xffff050504040303, 0xffff050504040303,
+ 0xffffffffff050403, 0xffff050504040303, 0xffff060605050404, 0xffffffffff060504,
+}
+var expandAVX512_20SIMD_outShufLo = [8]uint64{
+ 0x2019181110080100, 0x4841403831302928, 0x1209030259585049, 0x33322b2a211b1a13,
+ 0x5b5a514b4a434239, 0x221d1c15140a0504, 0x4c45443a35342d2c, 0x160b07065d5c524d,
+}
+var expandAVX512_20SIMD_outShufHi = [8]uint64{
+ 0x4140393830292820, 0x6968605958515048, 0x312b2a2221787170, 0x5a53524943423b3a,
+ 0x237973726b6a615b, 0x45443d3c322d2c24, 0x6d6c625d5c55544a, 0x332f2e26257a7574,
+}
+
+// expandAVX512_20SIMD is the expand kernel at index 12 of
+// gcExpandersAVX512SIMD. It loads 64 bytes from src and builds three
+// intermediate vectors; intermediate i is a Permute of the input with
+// inShuf<i> followed by GaloisFieldAffineTransform with mat<i>. The first
+// result is a Permute2 over intermediates 0 and 1 using outShufLo; the second
+// is a Permute2 over intermediates 1 and 2 using outShufHi.
+func expandAVX512_20SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+	mat0 := simd.LoadUint64x8(&expandAVX512_20SIMD_mat0)
+	gather0 := simd.LoadUint64x8(&expandAVX512_20SIMD_inShuf0).AsUint8x64()
+	part0 := in.Permute(gather0).GaloisFieldAffineTransform(mat0, 0)
+
+	mat1 := simd.LoadUint64x8(&expandAVX512_20SIMD_mat1)
+	gather1 := simd.LoadUint64x8(&expandAVX512_20SIMD_inShuf1).AsUint8x64()
+	part1 := in.Permute(gather1).GaloisFieldAffineTransform(mat1, 0)
+
+	mat2 := simd.LoadUint64x8(&expandAVX512_20SIMD_mat2)
+	gather2 := simd.LoadUint64x8(&expandAVX512_20SIMD_inShuf2).AsUint8x64()
+	part2 := in.Permute(gather2).GaloisFieldAffineTransform(mat2, 0)
+
+	shufLo := simd.LoadUint64x8(&expandAVX512_20SIMD_outShufLo).AsUint8x64()
+	shufHi := simd.LoadUint64x8(&expandAVX512_20SIMD_outShufHi).AsUint8x64()
+	lo := part0.Permute2(part1, shufLo)
+	hi := part1.Permute2(part2, shufHi)
+
+	return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_22SIMD_mat0 = [8]uint64{ // stage 0 matrix operand for GaloisFieldAffineTransform in expandAVX512_22SIMD
+ 0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+ 0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_22SIMD_inShuf0 = [8]uint64{ // stage 0 Permute indices into the source bytes
+ 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100,
+ 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000, 0xffff020201010000,
+}
+var expandAVX512_22SIMD_mat1 = [8]uint64{ // stage 1 matrix operand for GaloisFieldAffineTransform
+ 0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+ 0x4040808080808080, 0x8080808080808080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_22SIMD_inShuf1 = [8]uint64{ // stage 1 Permute indices into the source bytes
+ 0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000,
+ 0xffffffffff020100, 0xffffffff01010000, 0xffff040403030202, 0xffff050504040303,
+}
+var expandAVX512_22SIMD_mat2 = [8]uint64{ // stage 2 matrix operand for GaloisFieldAffineTransform
+ 0x0101010101010202, 0x0202020202020202, 0x0202020204040404, 0x0404040404040404,
+ 0x0404080808080808, 0x0808080808080808, 0x1010101010101010, 0x1010101010102020,
+}
+var expandAVX512_22SIMD_inShuf2 = [8]uint64{ // stage 2 Permute indices into the source bytes
+ 0xffffffffff050403, 0xffff050504040303, 0xffffffffff050403, 0xffff050504040303,
+ 0xffffffffff050403, 0xffff050504040303, 0xffff050504040303, 0xffffffffff050403,
+}
+var expandAVX512_22SIMD_mat3 = [8]uint64{ // stage 3 matrix operand for GaloisFieldAffineTransform
+ 0x2020202020202020, 0x2020202040404040, 0x4040404040404040, 0x4040808080808080,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_22SIMD_inShuf3 = [8]uint64{ // stage 3 Permute indices into the source bytes
+ 0xffff050504040303, 0xffffffffff050403, 0xffffff0504040303, 0xffffffffffff0403,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_22SIMD_outShufLo = [8]uint64{ // Permute2 indices over stages 0 and 1; forms the first result
+ 0x2120181110080100, 0x4948403938313028, 0x0302696860595850, 0x3229232219131209,
+ 0x5a514b4a413b3a33, 0x140a05046b6a615b, 0x3c35342a25241a15, 0x625d5c524d4c423d,
+}
+var expandAVX512_22SIMD_outShufHi0 = [8]uint64{ // Permute2 indices over stages 1 and 2; masked part of the second result
+ 0x5049484039383130, 0x7871706968605958, 0x3332ffffffffffff, 0x5b5a514b4a413b3a,
+ 0xffff7973726b6a61, 0x3d3c3534ffffffff, 0x6c625d5c524d4c42, 0xffffffff7a75746d,
+}
+var expandAVX512_22SIMD_outShufHi1 = [8]uint64{ // Permute indices over stage 3; masked part of the second result that is ORed in
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffff181110080100, 0xffffffffffffffff,
+ 0x0302ffffffffffff, 0xffffffff19131209, 0xffffffffffffffff, 0x140a0504ffffffff,
+}
+
+// expandAVX512_22SIMD loads 64 bytes from src and returns two vectors built
+// from them. Four stages each gather source bytes with Permute (inShufN) and
+// apply GaloisFieldAffineTransform with matN. The first result is a Permute2
+// of stages 0 and 1. The second result ORs a masked Permute2 of stages 1 and 2
+// with a masked Permute of stage 3; the two lane masks are bitwise complements.
+func expandAVX512_22SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+ // Stage 0.
+ gather0 := simd.LoadUint64x8(&expandAVX512_22SIMD_inShuf0).AsUint8x64()
+ matrix0 := simd.LoadUint64x8(&expandAVX512_22SIMD_mat0)
+ stage0 := in.Permute(gather0).GaloisFieldAffineTransform(matrix0, 0)
+
+ // Stage 1.
+ gather1 := simd.LoadUint64x8(&expandAVX512_22SIMD_inShuf1).AsUint8x64()
+ matrix1 := simd.LoadUint64x8(&expandAVX512_22SIMD_mat1)
+ stage1 := in.Permute(gather1).GaloisFieldAffineTransform(matrix1, 0)
+
+ // Stage 2.
+ gather2 := simd.LoadUint64x8(&expandAVX512_22SIMD_inShuf2).AsUint8x64()
+ matrix2 := simd.LoadUint64x8(&expandAVX512_22SIMD_mat2)
+ stage2 := in.Permute(gather2).GaloisFieldAffineTransform(matrix2, 0)
+
+ // Stage 3.
+ gather3 := simd.LoadUint64x8(&expandAVX512_22SIMD_inShuf3).AsUint8x64()
+ matrix3 := simd.LoadUint64x8(&expandAVX512_22SIMD_mat3)
+ stage3 := in.Permute(gather3).GaloisFieldAffineTransform(matrix3, 0)
+
+ // First result.
+ loIdx := simd.LoadUint64x8(&expandAVX512_22SIMD_outShufLo).AsUint8x64()
+ lo := stage0.Permute2(stage1, loIdx)
+
+ // Second result, part A: lanes drawn from stages 1 and 2.
+ hiIdxA := simd.LoadUint64x8(&expandAVX512_22SIMD_outShufHi0).AsUint8x64()
+ keepA := simd.Mask8x64FromBits(uint64(0xffff03fffc0ffff))
+ hiA := stage1.Permute2(stage2, hiIdxA).Masked(keepA)
+
+ // Second result, part B: the complementary lanes, drawn from stage 3.
+ hiIdxB := simd.LoadUint64x8(&expandAVX512_22SIMD_outShufHi1).AsUint8x64()
+ keepB := simd.Mask8x64FromBits(uint64(0xf0000fc0003f0000))
+ hiB := stage3.Permute(hiIdxB).Masked(keepB)
+
+ return lo.AsUint64x8(), hiA.Or(hiB).AsUint64x8()
+}
+
+var expandAVX512_24SIMD_mat0 = [8]uint64{ // matrix operand for GaloisFieldAffineTransform in stages 0 and 1 of expandAVX512_24SIMD
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_24SIMD_inShuf0 = [8]uint64{ // stage 0 Permute indices into the source bytes
+ 0x0202010101000000, 0x0202010101000000, 0x0202010101000000, 0x0202010101000000,
+ 0x0202010101000000, 0xff02010101000000, 0xffff010101000000, 0xffff010101000000,
+}
+var expandAVX512_24SIMD_inShuf1 = [8]uint64{ // stage 1 Permute indices into the source bytes (stage 1 reuses mat0)
+ 0xffffffffffffff02, 0xffffffffffffff02, 0xffffffffffffff02, 0xffffffffffffff02,
+ 0xffffffffffffff02, 0x0404040303030202, 0x0404030303020202, 0x0404030303020202,
+}
+var expandAVX512_24SIMD_mat2 = [8]uint64{ // stage 2 matrix operand for GaloisFieldAffineTransform
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x4040404040404040, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_24SIMD_inShuf2 = [8]uint64{ // stage 2 Permute indices into the source bytes
+ 0x0505040404030303, 0x0505040404030303, 0x0505040404030303, 0xffff040404030303,
+ 0xffff040404030303, 0xffffffffffffff04, 0xffffffffffffff04, 0xffffffffffffff05,
+}
+var expandAVX512_24SIMD_mat3 = [8]uint64{ // stage 3 matrix operand for GaloisFieldAffineTransform
+ 0x0202020202020202, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_24SIMD_inShuf3 = [8]uint64{ // stage 3 Permute indices into the source bytes
+ 0xffffffffffffff05, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_24SIMD_outShufLo = [8]uint64{ // Permute2 indices over stages 0 and 1; forms the first result
+ 0x11100a0908020100, 0x282221201a191812, 0x3a39383231302a29, 0x14130d0c0b050403,
+ 0x2b2524231d1c1b15, 0x3d3c3b3534332d2c, 0x1716480f0e400706, 0x2e602726581f1e50,
+}
+var expandAVX512_24SIMD_outShufHi0 = [8]uint64{ // Permute2 indices over stages 1 and 2; masked part of the second result
+ 0x3a39383231302928, 0x51504a4948424140, 0x2a6261605a595852, 0x3d3c3b3534332c2b,
+ 0x54534d4c4b454443, 0x2d6564635d5c5b55, 0x703f3e6837362f2e, 0x5756ff4f4e784746,
+}
+var expandAVX512_24SIMD_outShufHi1 = [8]uint64{ // Permute indices over stage 3; masked part of the second result that is ORed in
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff00ffffffffff,
+}
+
+// expandAVX512_24SIMD loads 64 bytes from src and returns two vectors built
+// from them. Four stages each gather source bytes with Permute (inShufN) and
+// apply GaloisFieldAffineTransform; stages 0 and 1 share mat0, stages 2 and 3
+// use mat2 and mat3. The first result is a Permute2 of stages 0 and 1. The
+// second result ORs a masked Permute2 of stages 1 and 2 with a masked Permute
+// of stage 3; the two lane masks are bitwise complements.
+func expandAVX512_24SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+ // Stages 0 and 1 use the same matrix with different gathers.
+ matrix01 := simd.LoadUint64x8(&expandAVX512_24SIMD_mat0)
+ gather0 := simd.LoadUint64x8(&expandAVX512_24SIMD_inShuf0).AsUint8x64()
+ gather1 := simd.LoadUint64x8(&expandAVX512_24SIMD_inShuf1).AsUint8x64()
+ stage0 := in.Permute(gather0).GaloisFieldAffineTransform(matrix01, 0)
+ stage1 := in.Permute(gather1).GaloisFieldAffineTransform(matrix01, 0)
+
+ // Stage 2.
+ gather2 := simd.LoadUint64x8(&expandAVX512_24SIMD_inShuf2).AsUint8x64()
+ matrix2 := simd.LoadUint64x8(&expandAVX512_24SIMD_mat2)
+ stage2 := in.Permute(gather2).GaloisFieldAffineTransform(matrix2, 0)
+
+ // Stage 3.
+ gather3 := simd.LoadUint64x8(&expandAVX512_24SIMD_inShuf3).AsUint8x64()
+ matrix3 := simd.LoadUint64x8(&expandAVX512_24SIMD_mat3)
+ stage3 := in.Permute(gather3).GaloisFieldAffineTransform(matrix3, 0)
+
+ // First result.
+ loIdx := simd.LoadUint64x8(&expandAVX512_24SIMD_outShufLo).AsUint8x64()
+ lo := stage0.Permute2(stage1, loIdx)
+
+ // Second result, part A: lanes drawn from stages 1 and 2.
+ hiIdxA := simd.LoadUint64x8(&expandAVX512_24SIMD_outShufHi0).AsUint8x64()
+ keepA := simd.Mask8x64FromBits(uint64(0xdfffffffffffffff))
+ hiA := stage1.Permute2(stage2, hiIdxA).Masked(keepA)
+
+ // Second result, part B: the complementary lanes, drawn from stage 3.
+ hiIdxB := simd.LoadUint64x8(&expandAVX512_24SIMD_outShufHi1).AsUint8x64()
+ keepB := simd.Mask8x64FromBits(uint64(0x2000000000000000))
+ hiB := stage3.Permute(hiIdxB).Masked(keepB)
+
+ return lo.AsUint64x8(), hiA.Or(hiB).AsUint64x8()
+}
+
+var expandAVX512_26SIMD_mat0 = [8]uint64{ // stage 0 matrix operand for GaloisFieldAffineTransform in expandAVX512_26SIMD
+ 0x0101010101010101, 0x0101020202020202, 0x0202020202020202, 0x0202020204040404,
+ 0x0404040404040404, 0x0404040404040808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_26SIMD_inShuf0 = [8]uint64{ // stage 0 Permute indices into the source bytes
+ 0x0202010101000000, 0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100,
+ 0xffff020201010000, 0xffffffffff020100, 0x0202010101000000, 0xffff010101000000,
+}
+var expandAVX512_26SIMD_mat1 = [8]uint64{ // stage 1 matrix operand for GaloisFieldAffineTransform
+ 0x1010202020202020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+ 0x4040404040408080, 0x8080808080808080, 0x0101010101010101, 0x0808080808080808,
+}
+var expandAVX512_26SIMD_inShuf1 = [8]uint64{ // stage 1 Permute indices into the source bytes
+ 0xffffffffffff0100, 0xffffffff01010000, 0xffffffffffff0100, 0xffffffff01010000,
+ 0xffffffffffff0100, 0xffff010101000000, 0xffffffffffffff02, 0xff04040403030302,
+}
+var expandAVX512_26SIMD_mat2 = [8]uint64{ // stage 2 matrix operand for GaloisFieldAffineTransform
+ 0x1010101010101010, 0x1010202020202020, 0x2020202020202020, 0x2020202040404040,
+ 0x4040404040404040, 0x4040404040408080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_26SIMD_inShuf2 = [8]uint64{ // stage 2 Permute indices into the source bytes
+ 0x0404030303020202, 0xffffffffff040302, 0xffff040403030202, 0xffffffffff040302,
+ 0xffff040403030202, 0xffffffffff040302, 0xff04030303020202, 0xffff040404030303,
+}
+var expandAVX512_26SIMD_mat3 = [8]uint64{ // stage 3 matrix operand for GaloisFieldAffineTransform
+ 0x0101020202020202, 0x0202020202020202, 0x0202020204040404, 0x0404040404040404,
+ 0x0404040404040808, 0x1010101010101010, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_26SIMD_inShuf3 = [8]uint64{ // stage 3 Permute indices into the source bytes
+ 0xffffffffffff0403, 0xffffffff04040303, 0xffffffffffff0403, 0xffffffff04040303,
+ 0xffffffffffff0403, 0xffffffffffffff04, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_26SIMD_outShufLo = [8]uint64{ // Permute2 indices over stages 0 and 1; forms the first result
+ 0x2018111008020100, 0x3a39383231302821, 0x6860595850494840, 0x1312090504036a69,
+ 0x3b35343329232219, 0x5b5a514b4a413d3c, 0x0a7007066d6c6b61, 0x37362a25241a1514,
+}
+var expandAVX512_26SIMD_outShufHi0 = [8]uint64{ // Permute2 indices over stages 1 and 2; masked part of the second result
+ 0x5851504842414038, 0x7978727170686160, 0xffffffffffffff7a, 0x52494544433b3a39,
+ 0x7574736963625953, 0xffffffffff7d7c7b, 0xff47463e3d3cffff, 0x766a65645a55544a,
+}
+var expandAVX512_26SIMD_outShufHi1 = [8]uint64{ // Permute indices over stage 3; masked part of the second result that is ORed in
+ 0xffffffffffffffff, 0xffffffffffffffff, 0x20191810090800ff, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0x1a110b0a01ffffff, 0x28ffffffffff211b, 0xffffffffffffffff,
+}
+
+// expandAVX512_26SIMD loads 64 bytes from src and returns two vectors built
+// from them. Four stages each gather source bytes with Permute (inShufN) and
+// apply GaloisFieldAffineTransform with matN. The first result is a Permute2
+// of stages 0 and 1. The second result ORs a masked Permute2 of stages 1 and 2
+// with a masked Permute of stage 3; the two lane masks are bitwise complements.
+func expandAVX512_26SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+ // Stage 0.
+ gather0 := simd.LoadUint64x8(&expandAVX512_26SIMD_inShuf0).AsUint8x64()
+ matrix0 := simd.LoadUint64x8(&expandAVX512_26SIMD_mat0)
+ stage0 := in.Permute(gather0).GaloisFieldAffineTransform(matrix0, 0)
+
+ // Stage 1.
+ gather1 := simd.LoadUint64x8(&expandAVX512_26SIMD_inShuf1).AsUint8x64()
+ matrix1 := simd.LoadUint64x8(&expandAVX512_26SIMD_mat1)
+ stage1 := in.Permute(gather1).GaloisFieldAffineTransform(matrix1, 0)
+
+ // Stage 2.
+ gather2 := simd.LoadUint64x8(&expandAVX512_26SIMD_inShuf2).AsUint8x64()
+ matrix2 := simd.LoadUint64x8(&expandAVX512_26SIMD_mat2)
+ stage2 := in.Permute(gather2).GaloisFieldAffineTransform(matrix2, 0)
+
+ // Stage 3.
+ gather3 := simd.LoadUint64x8(&expandAVX512_26SIMD_inShuf3).AsUint8x64()
+ matrix3 := simd.LoadUint64x8(&expandAVX512_26SIMD_mat3)
+ stage3 := in.Permute(gather3).GaloisFieldAffineTransform(matrix3, 0)
+
+ // First result.
+ loIdx := simd.LoadUint64x8(&expandAVX512_26SIMD_outShufLo).AsUint8x64()
+ lo := stage0.Permute2(stage1, loIdx)
+
+ // Second result, part A: lanes drawn from stages 1 and 2.
+ hiIdxA := simd.LoadUint64x8(&expandAVX512_26SIMD_outShufHi0).AsUint8x64()
+ keepA := simd.Mask8x64FromBits(uint64(0xff7c07ffff01ffff))
+ hiA := stage1.Permute2(stage2, hiIdxA).Masked(keepA)
+
+ // Second result, part B: the complementary lanes, drawn from stage 3.
+ hiIdxB := simd.LoadUint64x8(&expandAVX512_26SIMD_outShufHi1).AsUint8x64()
+ keepB := simd.Mask8x64FromBits(uint64(0x83f80000fe0000))
+ hiB := stage3.Permute(hiIdxB).Masked(keepB)
+
+ return lo.AsUint64x8(), hiA.Or(hiB).AsUint64x8()
+}
+
+var expandAVX512_28SIMD_mat0 = [8]uint64{ // stage 0 matrix operand for GaloisFieldAffineTransform in expandAVX512_28SIMD
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_28SIMD_inShuf0 = [8]uint64{ // stage 0 Permute indices into the source bytes
+ 0x0202010101000000, 0xffffffffff020100, 0x0202010101000000, 0xff02010101000000,
+ 0xffffffffffff0100, 0xffff010101000000, 0xffff010101000000, 0xffffffffffff0100,
+}
+var expandAVX512_28SIMD_mat1 = [8]uint64{ // stage 1 matrix operand for GaloisFieldAffineTransform
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0404040408080808,
+}
+var expandAVX512_28SIMD_inShuf1 = [8]uint64{ // stage 1 Permute indices into the source bytes
+ 0xffff010101000000, 0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000,
+ 0xffffffffffffff02, 0xffffffffffffff02, 0x0404040303030202, 0xffffffffff040302,
+}
+var expandAVX512_28SIMD_mat2 = [8]uint64{ // stage 2 matrix operand for GaloisFieldAffineTransform
+ 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, 0x2020202020202020,
+ 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_28SIMD_inShuf2 = [8]uint64{ // stage 2 Permute indices into the source bytes
+ 0x0404030303020202, 0x0404030303020202, 0xffffffffffff0302, 0xffff030303020202,
+ 0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffff040404030303,
+}
+var expandAVX512_28SIMD_mat3 = [8]uint64{ // stage 3 matrix operand for GaloisFieldAffineTransform
+ 0x0101010102020202, 0x0202020202020202, 0x0808080808080808, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_28SIMD_inShuf3 = [8]uint64{ // stage 3 Permute indices into the source bytes
+ 0xffffffffffff0403, 0xffff040404030303, 0xffffffffffffff04, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_28SIMD_outShufLo = [8]uint64{ // Permute2 indices over stages 0 and 1; forms the first result
+ 0x1812111008020100, 0x31302a2928201a19, 0x4a49484241403832, 0x090504035a595850,
+ 0x2b211d1c1b151413, 0x4443393534332d2c, 0x5d5c5b514d4c4b45, 0x1e6817160a600706,
+}
+var expandAVX512_28SIMD_outShufHi0 = [8]uint64{ // Permute2 indices over stages 1 and 2; masked part of the second result
+ 0x4948424140383130, 0x6261605a5958504a, 0xff7a797872717068, 0x4339343332ffffff,
+ 0x5c5b514d4c4b4544, 0x757473696564635d, 0x35ffffffff7d7c7b, 0x4f4eff47463a3736,
+}
+var expandAVX512_28SIMD_outShufHi1 = [8]uint64{ // Permute indices over stage 3; masked part of the second result that is ORed in
+ 0xffffffffffffffff, 0xffffffffffffffff, 0x00ffffffffffffff, 0xffffffffff0a0908,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xff0d0c0b01ffffff, 0xffff10ffffffffff,
+}
+
+// expandAVX512_28SIMD loads 64 bytes from src and returns two vectors built
+// from them. Four stages each gather source bytes with Permute (inShufN) and
+// apply GaloisFieldAffineTransform with matN. The first result is a Permute2
+// of stages 0 and 1. The second result ORs a masked Permute2 of stages 1 and 2
+// with a masked Permute of stage 3; the two lane masks are bitwise complements.
+func expandAVX512_28SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+ // Stage 0.
+ gather0 := simd.LoadUint64x8(&expandAVX512_28SIMD_inShuf0).AsUint8x64()
+ matrix0 := simd.LoadUint64x8(&expandAVX512_28SIMD_mat0)
+ stage0 := in.Permute(gather0).GaloisFieldAffineTransform(matrix0, 0)
+
+ // Stage 1.
+ gather1 := simd.LoadUint64x8(&expandAVX512_28SIMD_inShuf1).AsUint8x64()
+ matrix1 := simd.LoadUint64x8(&expandAVX512_28SIMD_mat1)
+ stage1 := in.Permute(gather1).GaloisFieldAffineTransform(matrix1, 0)
+
+ // Stage 2.
+ gather2 := simd.LoadUint64x8(&expandAVX512_28SIMD_inShuf2).AsUint8x64()
+ matrix2 := simd.LoadUint64x8(&expandAVX512_28SIMD_mat2)
+ stage2 := in.Permute(gather2).GaloisFieldAffineTransform(matrix2, 0)
+
+ // Stage 3.
+ gather3 := simd.LoadUint64x8(&expandAVX512_28SIMD_inShuf3).AsUint8x64()
+ matrix3 := simd.LoadUint64x8(&expandAVX512_28SIMD_mat3)
+ stage3 := in.Permute(gather3).GaloisFieldAffineTransform(matrix3, 0)
+
+ // First result.
+ loIdx := simd.LoadUint64x8(&expandAVX512_28SIMD_outShufLo).AsUint8x64()
+ lo := stage0.Permute2(stage1, loIdx)
+
+ // Second result, part A: lanes drawn from stages 1 and 2.
+ hiIdxA := simd.LoadUint64x8(&expandAVX512_28SIMD_outShufHi0).AsUint8x64()
+ keepA := simd.Mask8x64FromBits(uint64(0xdf87fffff87fffff))
+ hiA := stage1.Permute2(stage2, hiIdxA).Masked(keepA)
+
+ // Second result, part B: the complementary lanes, drawn from stage 3.
+ hiIdxB := simd.LoadUint64x8(&expandAVX512_28SIMD_outShufHi1).AsUint8x64()
+ keepB := simd.Mask8x64FromBits(uint64(0x2078000007800000))
+ hiB := stage3.Permute(hiIdxB).Masked(keepB)
+
+ return lo.AsUint64x8(), hiA.Or(hiB).AsUint64x8()
+}
+
+var expandAVX512_30SIMD_mat0 = [8]uint64{ // stage 0 matrix operand for GaloisFieldAffineTransform in expandAVX512_30SIMD
+ 0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+ 0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_30SIMD_inShuf0 = [8]uint64{ // stage 0 Permute indices into the source bytes
+ 0x0202010101000000, 0xffffffffff020100, 0xffff010101000000, 0xffffffffffff0100,
+ 0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000, 0xffff010101000000,
+}
+var expandAVX512_30SIMD_mat1 = [8]uint64{ // stage 1 matrix operand for GaloisFieldAffineTransform
+ 0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+ 0x4040808080808080, 0x8080808080808080, 0x0101010101010101, 0x0202020202020202,
+}
+var expandAVX512_30SIMD_inShuf1 = [8]uint64{ // stage 1 Permute indices into the source bytes
+ 0xffffffffffff0100, 0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000,
+ 0xffffffffffff0100, 0xffff010101000000, 0xffffffffffffff02, 0x0404030303020202,
+}
+var expandAVX512_30SIMD_mat2 = [8]uint64{ // stage 2 matrix operand for GaloisFieldAffineTransform
+ 0x0202020204040404, 0x0404040404040404, 0x0404080808080808, 0x0808080808080808,
+ 0x1010101010101010, 0x1010101010102020, 0x2020202020202020, 0x2020202040404040,
+}
+var expandAVX512_30SIMD_inShuf2 = [8]uint64{ // stage 2 Permute indices into the source bytes
+ 0xffffffffff040302, 0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202,
+ 0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffffffffffff0302,
+}
+var expandAVX512_30SIMD_mat3 = [8]uint64{ // stage 3 matrix operand for GaloisFieldAffineTransform
+ 0x4040404040404040, 0x4040808080808080, 0x8080808080808080, 0x0101010101010101,
+ 0x0101010101010202, 0x0202020202020202, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_30SIMD_inShuf3 = [8]uint64{ // stage 3 Permute indices into the source bytes
+ 0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffff040404030303,
+ 0xffffffffffff0403, 0xffffffffffffff04, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_30SIMD_outShufLo = [8]uint64{ // Permute2 indices over stages 0 and 1; forms the first result
+ 0x1812111008020100, 0x3832313028222120, 0x58504a4948403a39, 0x04036a6968605a59,
+ 0x2423191514130905, 0x3d3c3b3534332925, 0x5d5c5b514d4c4b41, 0x0a7007066d6c6b61,
+}
+var expandAVX512_30SIMD_outShufHi0 = [8]uint64{ // Permute2 indices over stages 1 and 2; masked part of the second result
+ 0x504a4948403a3938, 0x70686261605a5958, 0xffffffffff787271, 0x3c3bffffffffffff,
+ 0x5c5b514d4c4b413d, 0x757473696564635d, 0xffffffffffffff79, 0x42ff3f3effffffff,
+}
+var expandAVX512_30SIMD_outShufHi1 = [8]uint64{ // Permute indices over stage 3; masked part of the second result that is ORed in
+ 0xffffffffffffffff, 0xffffffffffffffff, 0x1008020100ffffff, 0xffff201a19181211,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0x15141309050403ff, 0xff28ffff211d1c1b,
+}
+
+// expandAVX512_30SIMD loads 64 bytes from src and returns two vectors built
+// from them. Four stages each gather source bytes with Permute (inShufN) and
+// apply GaloisFieldAffineTransform with matN. The first result is a Permute2
+// of stages 0 and 1. The second result ORs a masked Permute2 of stages 1 and 2
+// with a masked Permute of stage 3; the two lane masks are bitwise complements.
+func expandAVX512_30SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+ // Stage 0.
+ gather0 := simd.LoadUint64x8(&expandAVX512_30SIMD_inShuf0).AsUint8x64()
+ matrix0 := simd.LoadUint64x8(&expandAVX512_30SIMD_mat0)
+ stage0 := in.Permute(gather0).GaloisFieldAffineTransform(matrix0, 0)
+
+ // Stage 1.
+ gather1 := simd.LoadUint64x8(&expandAVX512_30SIMD_inShuf1).AsUint8x64()
+ matrix1 := simd.LoadUint64x8(&expandAVX512_30SIMD_mat1)
+ stage1 := in.Permute(gather1).GaloisFieldAffineTransform(matrix1, 0)
+
+ // Stage 2.
+ gather2 := simd.LoadUint64x8(&expandAVX512_30SIMD_inShuf2).AsUint8x64()
+ matrix2 := simd.LoadUint64x8(&expandAVX512_30SIMD_mat2)
+ stage2 := in.Permute(gather2).GaloisFieldAffineTransform(matrix2, 0)
+
+ // Stage 3.
+ gather3 := simd.LoadUint64x8(&expandAVX512_30SIMD_inShuf3).AsUint8x64()
+ matrix3 := simd.LoadUint64x8(&expandAVX512_30SIMD_mat3)
+ stage3 := in.Permute(gather3).GaloisFieldAffineTransform(matrix3, 0)
+
+ // First result.
+ loIdx := simd.LoadUint64x8(&expandAVX512_30SIMD_outShufLo).AsUint8x64()
+ lo := stage0.Permute2(stage1, loIdx)
+
+ // Second result, part A: lanes drawn from stages 1 and 2.
+ hiIdxA := simd.LoadUint64x8(&expandAVX512_30SIMD_outShufHi0).AsUint8x64()
+ keepA := simd.Mask8x64FromBits(uint64(0xb001ffffc007ffff))
+ hiA := stage1.Permute2(stage2, hiIdxA).Masked(keepA)
+
+ // Second result, part B: the complementary lanes, drawn from stage 3.
+ hiIdxB := simd.LoadUint64x8(&expandAVX512_30SIMD_outShufHi1).AsUint8x64()
+ keepB := simd.Mask8x64FromBits(uint64(0x4ffe00003ff80000))
+ hiB := stage3.Permute(hiIdxB).Masked(keepB)
+
+ return lo.AsUint64x8(), hiA.Or(hiB).AsUint64x8()
+}
+
+var expandAVX512_32SIMD_mat0 = [8]uint64{ // matrix operand for both GaloisFieldAffineTransform calls in expandAVX512_32SIMD
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_32SIMD_inShuf0 = [8]uint64{ // Permute indices into the source bytes for the first result (source bytes 0 and 1)
+ 0x0101010100000000, 0x0101010100000000, 0x0101010100000000, 0x0101010100000000,
+ 0x0101010100000000, 0x0101010100000000, 0x0101010100000000, 0x0101010100000000,
+}
+var expandAVX512_32SIMD_inShuf1 = [8]uint64{ // Permute indices into the source bytes for the second result (source bytes 2 and 3)
+ 0x0303030302020202, 0x0303030302020202, 0x0303030302020202, 0x0303030302020202,
+ 0x0303030302020202, 0x0303030302020202, 0x0303030302020202, 0x0303030302020202,
+}
+var expandAVX512_32SIMD_outShufLo = [8]uint64{ // single-vector Permute indices applied to both transformed vectors
+ 0x0b0a090803020100, 0x1b1a191813121110, 0x2b2a292823222120, 0x3b3a393833323130,
+ 0x0f0e0d0c07060504, 0x1f1e1d1c17161514, 0x2f2e2d2c27262524, 0x3f3e3d3c37363534,
+}
+
+// expandAVX512_32SIMD loads 64 bytes from src and returns two vectors built
+// from them. Both results follow the same pipeline: gather source bytes with
+// Permute, apply GaloisFieldAffineTransform with the shared mat0, then reorder
+// the bytes with the shared outShufLo. Only the gather indices differ
+// (inShuf0 for the first result, inShuf1 for the second).
+func expandAVX512_32SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+ // Tables shared by both results.
+ matrix := simd.LoadUint64x8(&expandAVX512_32SIMD_mat0)
+ reorder := simd.LoadUint64x8(&expandAVX512_32SIMD_outShufLo).AsUint8x64()
+
+ // First result.
+ gatherLo := simd.LoadUint64x8(&expandAVX512_32SIMD_inShuf0).AsUint8x64()
+ lo := in.Permute(gatherLo).GaloisFieldAffineTransform(matrix, 0).Permute(reorder)
+
+ // Second result.
+ gatherHi := simd.LoadUint64x8(&expandAVX512_32SIMD_inShuf1).AsUint8x64()
+ hi := in.Permute(gatherHi).GaloisFieldAffineTransform(matrix, 0).Permute(reorder)
+
+ return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_36SIMD_mat0 = [8]uint64{ // stage 0 matrix operand for GaloisFieldAffineTransform in expandAVX512_36SIMD
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_36SIMD_inShuf0 = [8]uint64{ // stage 0 Permute indices into the source bytes
+ 0x0101010100000000, 0xffffffffffff0100, 0x0101010100000000, 0x0101010100000000,
+ 0xffffffffffff0100, 0x0101010100000000, 0x0101010100000000, 0xffffffffffff0100,
+}
+var expandAVX512_36SIMD_mat1 = [8]uint64{ // stage 1 matrix operand for GaloisFieldAffineTransform
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_36SIMD_inShuf1 = [8]uint64{ // stage 1 Permute indices into the source bytes
+ 0x0101010100000000, 0xffffff0100000000, 0xffffffffffffff00, 0xffffffff00000000,
+ 0xff02020202010101, 0xffffffffffff0201, 0x0202020201010101, 0x0303030302020202,
+}
+var expandAVX512_36SIMD_mat2 = [8]uint64{ // stage 2 matrix operand for GaloisFieldAffineTransform
+ 0x0101010102020202, 0x0202020202020202, 0x0404040404040404, 0x0404040408080808,
+ 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, 0x2020202020202020,
+}
+var expandAVX512_36SIMD_inShuf2 = [8]uint64{ // stage 2 Permute indices into the source bytes
+ 0xffffffffffff0302, 0x0303030302020202, 0x0303030302020202, 0xffffffffffff0302,
+ 0x0303030302020202, 0xffff030302020202, 0xffffffffffffff02, 0xffffffff02020202,
+}
+var expandAVX512_36SIMD_outShufLo = [8]uint64{ // Permute2 indices over stages 0 and 1; forms the first result
+ 0x1211100803020100, 0x2928201b1a191813, 0x4038333231302b2a, 0x504b4a4948434241,
+ 0x070605045b5a5958, 0x1e1d1c1716151409, 0x35342f2e2d2c211f, 0x4c47464544393736,
+}
+var expandAVX512_36SIMD_outShufHi = [8]uint64{ // Permute2 indices over stages 1 and 2; forms the second result
+ 0x3332313028222120, 0x4a4948403b3a3938, 0x616058535251504b, 0x78706b6a69686362,
+ 0x29262524237b7a79, 0x3f3e3d3c37363534, 0x5655544f4e4d4c41, 0x6d6c676665645957,
+}
+
+// expandAVX512_36SIMD loads 64 bytes from src and returns two vectors built
+// from them. Each of three stages gathers source bytes with Permute (inShufN)
+// and applies GaloisFieldAffineTransform with matN. Permute2 then combines
+// stages 0 and 1 into the first result and stages 1 and 2 into the second.
+func expandAVX512_36SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+ // Stage 0.
+ gather0 := simd.LoadUint64x8(&expandAVX512_36SIMD_inShuf0).AsUint8x64()
+ matrix0 := simd.LoadUint64x8(&expandAVX512_36SIMD_mat0)
+ stage0 := in.Permute(gather0).GaloisFieldAffineTransform(matrix0, 0)
+
+ // Stage 1.
+ gather1 := simd.LoadUint64x8(&expandAVX512_36SIMD_inShuf1).AsUint8x64()
+ matrix1 := simd.LoadUint64x8(&expandAVX512_36SIMD_mat1)
+ stage1 := in.Permute(gather1).GaloisFieldAffineTransform(matrix1, 0)
+
+ // Stage 2.
+ gather2 := simd.LoadUint64x8(&expandAVX512_36SIMD_inShuf2).AsUint8x64()
+ matrix2 := simd.LoadUint64x8(&expandAVX512_36SIMD_mat2)
+ stage2 := in.Permute(gather2).GaloisFieldAffineTransform(matrix2, 0)
+
+ // Output shuffles: each result draws bytes from two adjacent stages.
+ loIdx := simd.LoadUint64x8(&expandAVX512_36SIMD_outShufLo).AsUint8x64()
+ hiIdx := simd.LoadUint64x8(&expandAVX512_36SIMD_outShufHi).AsUint8x64()
+ lo := stage0.Permute2(stage1, loIdx)
+ hi := stage1.Permute2(stage2, hiIdx)
+ return lo.AsUint64x8(), hi.AsUint64x8()
+}
+
+var expandAVX512_40SIMD_mat0 = [8]uint64{ // stage 0 matrix operand for GaloisFieldAffineTransform in expandAVX512_40SIMD
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_40SIMD_inShuf0 = [8]uint64{ // stage 0 Permute indices into the source bytes
+ 0x0101010000000000, 0x0101010000000000, 0x0101010000000000, 0x0101010000000000,
+ 0x0101010000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000,
+}
+var expandAVX512_40SIMD_mat1 = [8]uint64{ // stage 1 matrix operand for GaloisFieldAffineTransform
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x1010101010101010, 0x2020202020202020, 0x4040404040404040,
+}
+var expandAVX512_40SIMD_inShuf1 = [8]uint64{ // stage 1 Permute indices into the source bytes
+ 0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101,
+ 0xffffffffffffff01, 0xffff020202020201, 0x0202020101010101, 0x0202020101010101,
+}
+var expandAVX512_40SIMD_mat2 = [8]uint64{ // stage 2 matrix operand for GaloisFieldAffineTransform
+ 0x8080808080808080, 0x0101010101010101, 0x0202020202020202, 0x0404040404040404,
+ 0x0808080808080808, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_40SIMD_inShuf2 = [8]uint64{ // stage 2 Permute indices into the source bytes
+ 0x0202020101010101, 0x0303030202020202, 0x0303030202020202, 0xffffff0202020202,
+ 0xffffff0202020202, 0xffffffffffff0202, 0xffffffffffff0202, 0xffffffffffff0202,
+}
+var expandAVX512_40SIMD_mat3 = [8]uint64{ // stage 3 matrix operand for GaloisFieldAffineTransform
+ 0x0101010101010101, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_40SIMD_inShuf3 = [8]uint64{ // stage 3 Permute indices into the source bytes
+ 0xffffffffffff0303, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_40SIMD_outShufLo = [8]uint64{ // Permute2 indices over stages 0 and 1; forms the first result
+ 0x0a09080403020100, 0x1814131211100c0b, 0x232221201c1b1a19, 0x31302c2b2a292824,
+ 0x3c3b3a3938343332, 0x0f0e0d4140070605, 0x1d51501716154948, 0x6027262559581f1e,
+}
+var expandAVX512_40SIMD_outShufHi0 = [8]uint64{ // Permute2 indices over stages 1 and 2; masked part of the second result
+ 0x3938343332313028, 0x44434241403c3b3a, 0x5251504c4b4a4948, 0x605c5b5a59585453,
+ 0x2c2b2a2964636261, 0x3e3d69683736352d, 0x797847464571703f, 0x575655ffff4f4e4d,
+}
+var expandAVX512_40SIMD_outShufHi1 = [8]uint64{ // Permute indices over stage 3; masked part of the second result that is ORed in
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffff0100ffffff,
+}
+
+// expandAVX512_40SIMD loads 64 bytes from src and returns two vectors built
+// from them. Four stages each gather source bytes with Permute (inShufN) and
+// apply GaloisFieldAffineTransform with matN. The first result is a Permute2
+// of stages 0 and 1. The second result ORs a masked Permute2 of stages 1 and 2
+// with a masked Permute of stage 3; the two lane masks are bitwise complements.
+func expandAVX512_40SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+ // Stage 0.
+ gather0 := simd.LoadUint64x8(&expandAVX512_40SIMD_inShuf0).AsUint8x64()
+ matrix0 := simd.LoadUint64x8(&expandAVX512_40SIMD_mat0)
+ stage0 := in.Permute(gather0).GaloisFieldAffineTransform(matrix0, 0)
+
+ // Stage 1.
+ gather1 := simd.LoadUint64x8(&expandAVX512_40SIMD_inShuf1).AsUint8x64()
+ matrix1 := simd.LoadUint64x8(&expandAVX512_40SIMD_mat1)
+ stage1 := in.Permute(gather1).GaloisFieldAffineTransform(matrix1, 0)
+
+ // Stage 2.
+ gather2 := simd.LoadUint64x8(&expandAVX512_40SIMD_inShuf2).AsUint8x64()
+ matrix2 := simd.LoadUint64x8(&expandAVX512_40SIMD_mat2)
+ stage2 := in.Permute(gather2).GaloisFieldAffineTransform(matrix2, 0)
+
+ // Stage 3.
+ gather3 := simd.LoadUint64x8(&expandAVX512_40SIMD_inShuf3).AsUint8x64()
+ matrix3 := simd.LoadUint64x8(&expandAVX512_40SIMD_mat3)
+ stage3 := in.Permute(gather3).GaloisFieldAffineTransform(matrix3, 0)
+
+ // First result.
+ loIdx := simd.LoadUint64x8(&expandAVX512_40SIMD_outShufLo).AsUint8x64()
+ lo := stage0.Permute2(stage1, loIdx)
+
+ // Second result, part A: lanes drawn from stages 1 and 2.
+ hiIdxA := simd.LoadUint64x8(&expandAVX512_40SIMD_outShufHi0).AsUint8x64()
+ keepA := simd.Mask8x64FromBits(uint64(0xe7ffffffffffffff))
+ hiA := stage1.Permute2(stage2, hiIdxA).Masked(keepA)
+
+ // Second result, part B: the complementary lanes, drawn from stage 3.
+ hiIdxB := simd.LoadUint64x8(&expandAVX512_40SIMD_outShufHi1).AsUint8x64()
+ keepB := simd.Mask8x64FromBits(uint64(0x1800000000000000))
+ hiB := stage3.Permute(hiIdxB).Masked(keepB)
+
+ return lo.AsUint64x8(), hiA.Or(hiB).AsUint64x8()
+}
+
+var expandAVX512_44SIMD_mat0 = [8]uint64{ // stage 0 matrix operand for GaloisFieldAffineTransform in expandAVX512_44SIMD
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_44SIMD_inShuf0 = [8]uint64{ // stage 0 Permute indices into the source bytes
+ 0x0101010000000000, 0xffffffffffff0100, 0x0101010000000000, 0x0101010000000000,
+ 0xffffffffffff0100, 0x0101010000000000, 0xffffff0000000000, 0xffffffffffffff00,
+}
+var expandAVX512_44SIMD_mat1 = [8]uint64{ // stage 1 matrix operand for GaloisFieldAffineTransform
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+}
+var expandAVX512_44SIMD_inShuf1 = [8]uint64{ // stage 1 Permute indices into the source bytes
+ 0xffffff0000000000, 0xffffff0000000000, 0xffffffffffffff00, 0xffffff0000000000,
+ 0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101, 0xff02020202020101,
+}
+var expandAVX512_44SIMD_mat2 = [8]uint64{ // stage 2 matrix operand for GaloisFieldAffineTransform
+ 0x1010101010101010, 0x1010101020202020, 0x2020202020202020, 0x4040404040404040,
+ 0x4040404080808080, 0x8080808080808080, 0x0101010101010101, 0x0101010102020202,
+}
+var expandAVX512_44SIMD_inShuf2 = [8]uint64{ // stage 2 Permute indices into the source bytes
+ 0x0202020101010101, 0xffffffffffff0201, 0x0202020101010101, 0x0202020101010101,
+ 0xffffffffffff0201, 0xffff020101010101, 0xffffff0202020202, 0xffffffffffffff02,
+}
+var expandAVX512_44SIMD_mat3 = [8]uint64{ // stage 3 matrix operand for GaloisFieldAffineTransform
+ 0x0202020202020202, 0x0404040404040404, 0x0404040408080808, 0x1010101010101010,
+ 0x2020202020202020, 0x4040404040404040, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_44SIMD_inShuf3 = [8]uint64{ // stage 3 Permute indices into the source bytes
+ 0xffffff0202020202, 0xffffff0202020202, 0xffffffffffffff02, 0xffffffffffff0202,
+ 0xffffffffffff0202, 0xffffffffffff0202, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_44SIMD_outShufLo = [8]uint64{ // Permute2 indices over stages 0 and 1; forms the first result
+ 0x1110080403020100, 0x1c1b1a1918141312, 0x31302c2b2a292820, 0x4342414038343332,
+ 0x58504c4b4a494844, 0x600706055c5b5a59, 0x1d69681716150961, 0x2f2e2d2171701f1e,
+}
+var expandAVX512_44SIMD_outShufHi0 = [8]uint64{ // Permute2 indices over stages 1 and 2; masked part of the second result
+ 0x4844434241403938, 0x5a59585453525150, 0x6c6b6a6968605c5b, 0xffff787473727170,
+ 0xffffffffffffffff, 0x46453e3d3c3b3aff, 0xff57565549ffff47, 0x6d61ffff5f5e5dff,
+}
+var expandAVX512_44SIMD_outShufHi1 = [8]uint64{ // Permute indices over stage 3; masked part of the second result that is ORed in
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x0100ffffffffffff,
+ 0x0c0b0a0908040302, 0xffffffffffffff10, 0x20ffffffff1918ff, 0xffff2928ffffff21,
+}
+
+// expandAVX512_44SIMD loads 64 bytes from src and returns two vectors built
+// from them. Four stages each gather source bytes with Permute (inShufN) and
+// apply GaloisFieldAffineTransform with matN. The first result is a Permute2
+// of stages 0 and 1. The second result ORs a masked Permute2 of stages 1 and 2
+// with a masked Permute of stage 3; the two lane masks are bitwise complements.
+func expandAVX512_44SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ in := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+
+ // Stage 0.
+ gather0 := simd.LoadUint64x8(&expandAVX512_44SIMD_inShuf0).AsUint8x64()
+ matrix0 := simd.LoadUint64x8(&expandAVX512_44SIMD_mat0)
+ stage0 := in.Permute(gather0).GaloisFieldAffineTransform(matrix0, 0)
+
+ // Stage 1.
+ gather1 := simd.LoadUint64x8(&expandAVX512_44SIMD_inShuf1).AsUint8x64()
+ matrix1 := simd.LoadUint64x8(&expandAVX512_44SIMD_mat1)
+ stage1 := in.Permute(gather1).GaloisFieldAffineTransform(matrix1, 0)
+
+ // Stage 2.
+ gather2 := simd.LoadUint64x8(&expandAVX512_44SIMD_inShuf2).AsUint8x64()
+ matrix2 := simd.LoadUint64x8(&expandAVX512_44SIMD_mat2)
+ stage2 := in.Permute(gather2).GaloisFieldAffineTransform(matrix2, 0)
+
+ // Stage 3.
+ gather3 := simd.LoadUint64x8(&expandAVX512_44SIMD_inShuf3).AsUint8x64()
+ matrix3 := simd.LoadUint64x8(&expandAVX512_44SIMD_mat3)
+ stage3 := in.Permute(gather3).GaloisFieldAffineTransform(matrix3, 0)
+
+ // First result.
+ loIdx := simd.LoadUint64x8(&expandAVX512_44SIMD_outShufLo).AsUint8x64()
+ lo := stage0.Permute2(stage1, loIdx)
+
+ // Second result, part A: lanes drawn from stages 1 and 2.
+ hiIdxA := simd.LoadUint64x8(&expandAVX512_44SIMD_outShufHi0).AsUint8x64()
+ keepA := simd.Mask8x64FromBits(uint64(0xce79fe003fffffff))
+ hiA := stage1.Permute2(stage2, hiIdxA).Masked(keepA)
+
+ // Second result, part B: the complementary lanes, drawn from stage 3.
+ hiIdxB := simd.LoadUint64x8(&expandAVX512_44SIMD_outShufHi1).AsUint8x64()
+ keepB := simd.Mask8x64FromBits(uint64(0x318601ffc0000000))
+ hiB := stage3.Permute(hiIdxB).Masked(keepB)
+
+ return lo.AsUint64x8(), hiA.Or(hiB).AsUint64x8()
+}
+
+var expandAVX512_48SIMD_mat0 = [8]uint64{ // stage 0 matrix operand for GaloisFieldAffineTransform in expandAVX512_48SIMD
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_48SIMD_inShuf0 = [8]uint64{ // stage 0 Permute indices into the source bytes
+ 0x0101000000000000, 0x0101000000000000, 0x0101000000000000, 0xffff000000000000,
+ 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000,
+}
+var expandAVX512_48SIMD_mat1 = [8]uint64{ // stage 1 matrix operand for GaloisFieldAffineTransform
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0404040404040404,
+ 0x0808080808080808, 0x1010101010101010, 0x2020202020202020, 0x4040404040404040,
+}
+var expandAVX512_48SIMD_inShuf1 = [8]uint64{ // stage 1 Permute indices into the source bytes
+ 0xffffffff01010101, 0xffffffff01010101, 0xffffffffffff0101, 0x0202020202020101,
+ 0x0202010101010101, 0x0202010101010101, 0x0202010101010101, 0xffff010101010101,
+}
+var expandAVX512_48SIMD_mat2 = [8]uint64{ // stage 2 matrix operand for GaloisFieldAffineTransform
+ 0x8080808080808080, 0x0101010101010101, 0x0202020202020202, 0x0808080808080808,
+ 0x1010101010101010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_48SIMD_inShuf2 = [8]uint64{ // stage 2 Permute indices into the source bytes
+ 0xffff010101010101, 0xffff020202020202, 0xffff020202020202, 0xffffffff02020202,
+ 0xffffffff02020202, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_48SIMD_outShufLo = [8]uint64{ // Permute2 indices over stages 0 and 1; forms the first result
+ 0x0908050403020100, 0x131211100d0c0b0a, 0x1d1c1b1a19181514, 0x2928252423222120,
+ 0x333231302d2c2b2a, 0x3d3c3b3a39383534, 0x0f0e434241400706, 0x515017164b4a4948,
+}
+var expandAVX512_48SIMD_outShufHi = [8]uint64{ // Permute2 indices over stages 1 and 2; forms the second result
+ 0x2524232221201918, 0x31302d2c2b2a2928, 0x3b3a393835343332, 0x4544434241403d3c,
+ 0x51504d4c4b4a4948, 0x1d1c1b1a55545352, 0x5b5a595827261f1e, 0x3736636261602f2e,
+}
+
+// expandAVX512_48SIMD expands each bit of the bitmap at src into 48
+// consecutive bits. The first result holds the low 512 bits of the expanded
+// bitmap and the second result holds the high 512 bits.
+//
+// This function is emitted by mkexpanders.go; change the generator rather than
+// editing it by hand.
+func expandAVX512_48SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ // Load the 64 input bytes and the constant tables.
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_48SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_48SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_48SIMD_mat1).AsUint8x64()
+ v6 := simd.LoadUint64x8(&expandAVX512_48SIMD_inShuf1).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_48SIMD_mat2).AsUint8x64()
+ v10 := simd.LoadUint64x8(&expandAVX512_48SIMD_inShuf2).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_48SIMD_outShufLo).AsUint8x64()
+ v15 := simd.LoadUint64x8(&expandAVX512_48SIMD_outShufHi).AsUint8x64()
+ // Three rounds: gather input bytes with inShufN, then apply the matN bit
+ // matrices to them.
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v7 := v0.Permute(v6)
+ v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+ v11 := v0.Permute(v10)
+ v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+ // Each output half is assembled from two of the intermediate vectors.
+ v14 := v4.Permute2(v8, v13)
+ v16 := v8.Permute2(v12, v15)
+ return v14.AsUint64x8(), v16.AsUint64x8()
+}
+
+var expandAVX512_52SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_52SIMD_inShuf0 = [8]uint64{
+ 0x0101000000000000, 0xffffffffffff0100, 0x0101000000000000, 0xffff000000000000,
+ 0xffffffffffffff00, 0xffff000000000000, 0xffff000000000000, 0xffffffffffffff00,
+}
+var expandAVX512_52SIMD_mat1 = [8]uint64{
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0101010101010101, 0x0202020202020202, 0x0202020202020202, 0x0404040404040404,
+}
+var expandAVX512_52SIMD_inShuf1 = [8]uint64{
+ 0xffff000000000000, 0xffff000000000000, 0xffffffffffffff00, 0xffff000000000000,
+ 0xffffffff01010101, 0xffffffffff010101, 0xff02020202020201, 0x0202010101010101,
+}
+var expandAVX512_52SIMD_mat2 = [8]uint64{
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+}
+var expandAVX512_52SIMD_inShuf2 = [8]uint64{
+ 0xffffffffffff0201, 0x0202010101010101, 0xffff010101010101, 0xffffffffffffff01,
+ 0xffff010101010101, 0xffff010101010101, 0xffffffffffffff01, 0xffff010101010101,
+}
+var expandAVX512_52SIMD_mat3 = [8]uint64{
+ 0x0101010101010101, 0x0101010102020202, 0x0404040404040404, 0x0808080808080808,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_52SIMD_inShuf3 = [8]uint64{
+ 0xffff020202020202, 0xffffffffffffff02, 0xffffffff02020202, 0xffffffffffff0202,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_52SIMD_outShufLo = [8]uint64{
+ 0x1008050403020100, 0x1a19181514131211, 0x2b2a2928201d1c1b, 0x3534333231302d2c,
+ 0x4845444342414038, 0x5958504d4c4b4a49, 0x616007065d5c5b5a, 0x6a69681716096362,
+}
+var expandAVX512_52SIMD_outShufHi0 = [8]uint64{
+ 0x403d3c3b3a393830, 0x51504d4c4b4a4948, 0x6261605855545352, 0x6c6b6a6968656463,
+ 0x7d7c7b7a7978706d, 0x31ffffffffffffff, 0xff3f3e3635343332, 0xffff4f4e41ffffff,
+}
+var expandAVX512_52SIMD_outShufHi1 = [8]uint64{
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xff08050403020100, 0x10ffffffffffffff, 0x1918ffffff131211,
+}
+
+// expandAVX512_52SIMD expands each bit of the bitmap at src into 52
+// consecutive bits. The first result holds the low 512 bits of the expanded
+// bitmap and the second result holds the high 512 bits.
+//
+// This function is emitted by mkexpanders.go; change the generator rather than
+// editing it by hand.
+func expandAVX512_52SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ // Load the 64 input bytes and the constant tables.
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_52SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_52SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_52SIMD_mat1).AsUint8x64()
+ v6 := simd.LoadUint64x8(&expandAVX512_52SIMD_inShuf1).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_52SIMD_mat2).AsUint8x64()
+ v10 := simd.LoadUint64x8(&expandAVX512_52SIMD_inShuf2).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_52SIMD_mat3).AsUint8x64()
+ v14 := simd.LoadUint64x8(&expandAVX512_52SIMD_inShuf3).AsUint8x64()
+ v17 := simd.LoadUint64x8(&expandAVX512_52SIMD_outShufLo).AsUint8x64()
+ v19 := simd.LoadUint64x8(&expandAVX512_52SIMD_outShufHi0).AsUint8x64()
+ v20 := simd.LoadUint64x8(&expandAVX512_52SIMD_outShufHi1).AsUint8x64()
+ // Four rounds: gather input bytes with inShufN, then apply the matN bit
+ // matrices to them.
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v7 := v0.Permute(v6)
+ v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+ v11 := v0.Permute(v10)
+ v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+ v15 := v0.Permute(v14)
+ v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+ // The low half is assembled from the first two intermediate vectors.
+ v18 := v4.Permute2(v8, v17)
+ // The high half draws on three intermediate vectors, so it is built from
+ // two masked shuffles that are ORed together. The two masks are exact
+ // complements of each other.
+ u0 := uint64(0x387f80ffffffffff)
+ m0 := simd.Mask8x64FromBits(u0)
+ v21 := v8.Permute2(v12, v19).Masked(m0)
+ u1 := uint64(0xc7807f0000000000)
+ m1 := simd.Mask8x64FromBits(u1)
+ v22 := v16.Permute(v20).Masked(m1)
+ v23 := v21.Or(v22)
+ return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_56SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_56SIMD_inShuf0 = [8]uint64{
+ 0x0100000000000000, 0x0100000000000000, 0xff00000000000000, 0xff00000000000000,
+ 0xff00000000000000, 0xff00000000000000, 0xff00000000000000, 0xff00000000000000,
+}
+var expandAVX512_56SIMD_inShuf1 = [8]uint64{
+ 0xffff010101010101, 0x0202010101010101, 0x0201010101010101, 0xff01010101010101,
+ 0xff01010101010101, 0xff01010101010101, 0xff01010101010101, 0xff01010101010101,
+}
+var expandAVX512_56SIMD_mat2 = [8]uint64{
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_56SIMD_inShuf2 = [8]uint64{
+ 0xff02020202020202, 0xffffff0202020202, 0xffffffffffffff02, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_56SIMD_outShufLo = [8]uint64{
+ 0x0806050403020100, 0x11100e0d0c0b0a09, 0x1a19181615141312, 0x232221201e1d1c1b,
+ 0x2c2b2a2928262524, 0x3534333231302e2d, 0x3e3d3c3b3a393836, 0x0f45444342414007,
+}
+var expandAVX512_56SIMD_outShufHi = [8]uint64{
+ 0x11100d0c0b0a0908, 0x1a19181615141312, 0x232221201e1d1c1b, 0x2c2b2a2928262524,
+ 0x3534333231302e2d, 0x3e3d3c3b3a393836, 0x0e46454443424140, 0x50174c4b4a49480f,
+}
+
+// expandAVX512_56SIMD expands each bit of the bitmap at src into 56
+// consecutive bits. The first result holds the low 512 bits of the expanded
+// bitmap and the second result holds the high 512 bits.
+//
+// This function is emitted by mkexpanders.go; change the generator rather than
+// editing it by hand.
+func expandAVX512_56SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ // Load the 64 input bytes and the constant tables.
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_56SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_56SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_56SIMD_inShuf1).AsUint8x64()
+ v8 := simd.LoadUint64x8(&expandAVX512_56SIMD_mat2).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_56SIMD_inShuf2).AsUint8x64()
+ v12 := simd.LoadUint64x8(&expandAVX512_56SIMD_outShufLo).AsUint8x64()
+ v14 := simd.LoadUint64x8(&expandAVX512_56SIMD_outShufHi).AsUint8x64()
+ // Three rounds: gather input bytes with inShufN, then apply a bit matrix.
+ // The second round reuses mat0; no separate mat1 table is declared.
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v6 := v0.Permute(v5)
+ v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v10 := v0.Permute(v9)
+ v11 := v10.GaloisFieldAffineTransform(v8.AsUint64x8(), 0)
+ // Each output half is assembled from two of the intermediate vectors.
+ v13 := v4.Permute2(v7, v12)
+ v15 := v7.Permute2(v11, v14)
+ return v13.AsUint64x8(), v15.AsUint64x8()
+}
+
+var expandAVX512_60SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+ 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_60SIMD_inShuf0 = [8]uint64{
+ 0x0100000000000000, 0xffffffffffffff00, 0xff00000000000000, 0xff00000000000000,
+ 0xffffffffffffff00, 0xff00000000000000, 0xff00000000000000, 0xffffffffffffff00,
+}
+var expandAVX512_60SIMD_mat1 = [8]uint64{
+ 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+ 0x0101010101010101, 0x0101010101010101, 0x0101010102020202, 0x0202020202020202,
+}
+var expandAVX512_60SIMD_inShuf1 = [8]uint64{
+ 0xff00000000000000, 0xff00000000000000, 0xffffffffffffff00, 0xff00000000000000,
+ 0xffffffffff010101, 0x0202020202010101, 0xffffffffffff0201, 0xff01010101010101,
+}
+var expandAVX512_60SIMD_mat2 = [8]uint64{
+ 0x0404040404040404, 0x0404040408080808, 0x0808080808080808, 0x1010101010101010,
+ 0x1010101020202020, 0x2020202020202020, 0x4040404040404040, 0x4040404080808080,
+}
+var expandAVX512_60SIMD_inShuf2 = [8]uint64{
+ 0xff01010101010101, 0xffffffffffffff01, 0xff01010101010101, 0xff01010101010101,
+ 0xffffffffffffff01, 0xff01010101010101, 0xff01010101010101, 0xffffffffffffff01,
+}
+var expandAVX512_60SIMD_mat3 = [8]uint64{
+ 0x8080808080808080, 0x0101010101010101, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_60SIMD_inShuf3 = [8]uint64{
+ 0xff01010101010101, 0xffffffffffff0202, 0xffffffffffffffff, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_60SIMD_outShufLo = [8]uint64{
+ 0x0806050403020100, 0x1816151413121110, 0x28201e1d1c1b1a19, 0x31302e2d2c2b2a29,
+ 0x4140383635343332, 0x4a49484645444342, 0x5a5958504e4d4c4b, 0x626160075e5d5c5b,
+}
+var expandAVX512_60SIMD_outShufHi0 = [8]uint64{
+ 0x3b3a3938302a2928, 0x44434241403e3d3c, 0x5453525150484645, 0x5d5c5b5a59585655,
+ 0x6d6c6b6a6968605e, 0x767574737271706e, 0xffffffffffffff78, 0x31ffff2f2e2d2c2b,
+}
+var expandAVX512_60SIMD_outShufHi1 = [8]uint64{
+ 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+ 0xffffffffffffffff, 0xffffffffffffffff, 0x06050403020100ff, 0xff0908ffffffffff,
+}
+
+// expandAVX512_60SIMD expands each bit of the bitmap at src into 60
+// consecutive bits. The first result holds the low 512 bits of the expanded
+// bitmap and the second result holds the high 512 bits.
+//
+// This function is emitted by mkexpanders.go; change the generator rather than
+// editing it by hand.
+func expandAVX512_60SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ // Load the 64 input bytes and the constant tables.
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_60SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_60SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_60SIMD_mat1).AsUint8x64()
+ v6 := simd.LoadUint64x8(&expandAVX512_60SIMD_inShuf1).AsUint8x64()
+ v9 := simd.LoadUint64x8(&expandAVX512_60SIMD_mat2).AsUint8x64()
+ v10 := simd.LoadUint64x8(&expandAVX512_60SIMD_inShuf2).AsUint8x64()
+ v13 := simd.LoadUint64x8(&expandAVX512_60SIMD_mat3).AsUint8x64()
+ v14 := simd.LoadUint64x8(&expandAVX512_60SIMD_inShuf3).AsUint8x64()
+ v17 := simd.LoadUint64x8(&expandAVX512_60SIMD_outShufLo).AsUint8x64()
+ v19 := simd.LoadUint64x8(&expandAVX512_60SIMD_outShufHi0).AsUint8x64()
+ v20 := simd.LoadUint64x8(&expandAVX512_60SIMD_outShufHi1).AsUint8x64()
+ // Four rounds: gather input bytes with inShufN, then apply the matN bit
+ // matrices to them.
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v7 := v0.Permute(v6)
+ v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+ v11 := v0.Permute(v10)
+ v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+ v15 := v0.Permute(v14)
+ v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+ // The low half is assembled from the first two intermediate vectors.
+ v18 := v4.Permute2(v8, v17)
+ // The high half draws on three intermediate vectors, so it is built from
+ // two masked shuffles that are ORed together. The two masks are exact
+ // complements of each other.
+ u0 := uint64(0x9f01ffffffffffff)
+ m0 := simd.Mask8x64FromBits(u0)
+ v21 := v8.Permute2(v12, v19).Masked(m0)
+ u1 := uint64(0x60fe000000000000)
+ m1 := simd.Mask8x64FromBits(u1)
+ v22 := v16.Permute(v20).Masked(m1)
+ v23 := v21.Or(v22)
+ return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_64SIMD_mat0 = [8]uint64{
+ 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+ 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_64SIMD_inShuf0 = [8]uint64{
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_64SIMD_inShuf1 = [8]uint64{
+ 0x0101010101010101, 0x0101010101010101, 0x0101010101010101, 0x0101010101010101,
+ 0x0101010101010101, 0x0101010101010101, 0x0101010101010101, 0x0101010101010101,
+}
+var expandAVX512_64SIMD_outShufLo = [8]uint64{
+ 0x0706050403020100, 0x0f0e0d0c0b0a0908, 0x1716151413121110, 0x1f1e1d1c1b1a1918,
+ 0x2726252423222120, 0x2f2e2d2c2b2a2928, 0x3736353433323130, 0x3f3e3d3c3b3a3938,
+}
+
+// expandAVX512_64SIMD expands each bit of the bitmap at src into 64
+// consecutive bits. The first result holds the low 512 bits of the expanded
+// bitmap and the second result holds the high 512 bits.
+//
+// This function is emitted by mkexpanders.go; change the generator rather than
+// editing it by hand.
+func expandAVX512_64SIMD(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+ // Load the 64 input bytes and the constant tables. Both halves share mat0
+ // and outShufLo.
+ v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ v1 := simd.LoadUint64x8(&expandAVX512_64SIMD_mat0).AsUint8x64()
+ v2 := simd.LoadUint64x8(&expandAVX512_64SIMD_inShuf0).AsUint8x64()
+ v5 := simd.LoadUint64x8(&expandAVX512_64SIMD_inShuf1).AsUint8x64()
+ v8 := simd.LoadUint64x8(&expandAVX512_64SIMD_outShufLo).AsUint8x64()
+ // inShuf0 holds index 0 in every lane and inShuf1 holds index 1 in every
+ // lane, so each half is derived from a single input byte.
+ v3 := v0.Permute(v2)
+ v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ v6 := v0.Permute(v5)
+ v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+ // outShufLo lists the indices 0 through 63 in order.
+ v9 := v4.Permute(v8)
+ v10 := v7.Permute(v8)
+ return v9.AsUint64x8(), v10.AsUint64x8()
+}
diff --git a/src/internal/runtime/gc/scan/mkexpanders.go b/src/internal/runtime/gc/scan/mkexpanders.go
new file mode 100644
index 0000000..bbfcb37
--- /dev/null
+++ b/src/internal/runtime/gc/scan/mkexpanders.go
@@ -0,0 +1,625 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file is a fork of mkasm.go. Instead of generating assembly,
+// it generates Go code that uses the simd package, which is
+// available under GOEXPERIMENT=simd.
+
+//go:build ignore
+
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "go/format"
+ "log"
+ "os"
+ "slices"
+ "strconv"
+ "text/template"
+ "unsafe"
+
+ "internal/runtime/gc"
+)
+
+// simdTemplate holds the text templates used to assemble the generated file:
+//
+//   - "header": license, generated-code marker, package clause and imports.
+//   - "expandersList": the gcExpandersAVX512SIMD table of expander functions.
+//   - "expanderData": one [8]uint64 constant table.
+//   - "expander": one expander function (its loads followed by its body).
+//
+// The header carries the standard "Code generated ... DO NOT EDIT." line so
+// that tools and reviewers recognize the output as generated.
+var simdTemplate = template.Must(template.New("template").Parse(`
+{{- define "header"}}
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by mkexpanders.go. DO NOT EDIT.
+
+package scan
+
+import (
+	"simd"
+	"unsafe"
+)
+{{- end}}
+{{- define "expandersList"}}
+var gcExpandersAVX512SIMD = [{{.NumFuncs}}]func(unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8){
+{{- range .Funcs}}
+	{{.}},
+{{- end}}
+}
+{{- end}}
+
+{{- define "expanderData"}}
+var {{.Name}} = [8]uint64{
+{{.Vals}}
+}
+{{- end}}
+
+{{- define "expander"}}
+func {{.Name}}(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+	{{- .BodyLoad}}
+	{{- .Body}}
+}
+{{- end}}
+`))
+
+// expandersListData is the input to the "expandersList" template.
+type expandersListData struct {
+ NumFuncs int // length of the generated gcExpandersAVX512SIMD array
+ Funcs []string // one element per array entry: an expander function name or "nil"
+}
+
+// expanderDataData is the input to the "expanderData" template, which emits
+// one [8]uint64 constant table.
+type expanderDataData struct {
+ Name string // name of the generated variable
+ Vals string // the table's elements as source text, each followed by a comma
+}
+
+// expanderData accumulates the pieces of one generated expander function. The
+// exported fields are read by the "expander" template; the rest is generator
+// state.
+type expanderData struct {
+ Name string // name of the generated function
+ BodyLoad string // load statements; the template places these before Body
+ Body string // the remaining statements, ending with the return
+ data []expanderDataData // constant tables to emit ahead of the function
+ dataV2N map[string]string // formatted table contents -> variable already holding that table
+ uint8x64Cnt int // number used for the next "v" variable name
+ mask8x64Cnt int // number used for the next "m" variable name
+ uint64Cnt int // number used for the next "u" variable name
+}
+
+func main() {
+ generate("expanders_amd64.go", genExpanders)
+}
+
+// generate runs genFunc to produce Go source, formats it with go/format, and
+// writes the result to fileName.
+//
+// The source is formatted before the output file is touched, so a formatting
+// failure leaves any existing file intact instead of truncating it. On a
+// formatting failure the unformatted source is logged to aid debugging. Every
+// error is fatal.
+func generate(fileName string, genFunc func(*bytes.Buffer)) {
+	var buf bytes.Buffer
+	genFunc(&buf)
+
+	src, err := format.Source(buf.Bytes())
+	if err != nil {
+		// The generated text is data, not a format string, so print it
+		// through an explicit verb.
+		log.Printf("%s", buf.Bytes())
+		log.Fatal(err)
+	}
+
+	// os.WriteFile creates or truncates the file, closes it, and reports
+	// write and close errors, so no deferred Close can be skipped by
+	// log.Fatal.
+	if err := os.WriteFile(fileName, src, 0o666); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// genExpanders writes the complete generated file to buffer: the header, the
+// gcExpandersAVX512SIMD table indexed by size class, and then every expander
+// function preceded by its constant tables.
+//
+// A size class gets a nil table entry when its spans are not exactly one page,
+// when it is at or beyond the first size class too big for a packed bitmap, or
+// when no expander could be generated for it. Template failures panic.
+func genExpanders(buffer *bytes.Buffer) {
+	if err := simdTemplate.ExecuteTemplate(buffer, "header", nil); err != nil {
+		panic(fmt.Errorf("failed to execute header template: %w", err))
+	}
+	expanders := make([]expanderData, len(gc.SizeClassToSize))
+	for sc, ob := range gc.SizeClassToSize {
+		if gc.SizeClassToNPages[sc] != 1 {
+			// These functions all produce a bitmap that covers exactly one
+			// page.
+			continue
+		}
+		if ob > gc.MinSizeForMallocHeader {
+			// This size class is too big to have a packed pointer/scalar bitmap.
+			break
+		}
+
+		xf := int(ob) / 8
+		log.Printf("size class %d bytes, expansion %dx", ob, xf)
+
+		fn := expanderData{Name: fmt.Sprintf("expandAVX512_%dSIMD", xf), dataV2N: make(map[string]string)}
+
+		if xf == 1 {
+			fn.expandIdentity()
+		} else if !gfExpander(xf, &fn) {
+			// fn holds only a partial body at this point. Leave the table
+			// entry nil rather than emitting an incomplete function.
+			log.Printf("failed to generate expander for size class %d", sc)
+			continue
+		}
+		expanders[sc] = fn
+	}
+
+	// Fill in the expanders list first.
+	eld := expandersListData{len(expanders), make([]string, len(expanders))}
+	for i, gce := range expanders {
+		if gce.Name == "" {
+			eld.Funcs[i] = "nil"
+		} else {
+			eld.Funcs[i] = gce.Name
+		}
+	}
+	if err := simdTemplate.ExecuteTemplate(buffer, "expandersList", eld); err != nil {
+		panic(fmt.Errorf("failed to execute expandersList template: %w", err))
+	}
+
+	// List out the expander functions and their data.
+	for _, gce := range expanders {
+		if gce.Name == "" {
+			continue
+		}
+		for _, data := range gce.data {
+			if err := simdTemplate.ExecuteTemplate(buffer, "expanderData", data); err != nil {
+				panic(fmt.Errorf("failed to execute expanderData template: %w", err))
+			}
+		}
+		if err := simdTemplate.ExecuteTemplate(buffer, "expander", gce); err != nil {
+			panic(fmt.Errorf("failed to execute expander template: %w", err))
+		}
+	}
+}
+
+// mat8x8 is an 8x8 bit matrix.
+//
+// mat[i] is row i, stored as one byte. As used by gfExpander, row i is ANDed
+// with an input byte and the parity of the result becomes bit i of the output
+// byte.
+type mat8x8 struct {
+ mat [8]uint8
+}
+
+// matGroupToVec packs each of the eight matrices into one uint64, one byte per
+// row. Row 0 lands in the most significant byte and row 7 in the least
+// significant byte, which is the row order the hardware expects.
+func matGroupToVec(mats *[8]mat8x8) [8]uint64 {
+	var vec [8]uint64
+	for g := range mats {
+		var packed uint64
+		for r, row := range mats[g].mat {
+			// Rows are stored flipped: row r goes into byte 7-r.
+			packed |= uint64(row) << ((7 - r) * 8)
+		}
+		vec[g] = packed
+	}
+	return vec
+}
+
+// newVec returns a fresh vector variable name ("v0", "v1", ...) for the
+// function being generated.
+func (fn *expanderData) newVec() string {
+	id := fn.uint8x64Cnt
+	fn.uint8x64Cnt = id + 1
+	return "v" + strconv.Itoa(id)
+}
+
+// newMask returns a fresh mask variable name ("m0", "m1", ...) for the
+// function being generated.
+func (fn *expanderData) newMask() string {
+	id := fn.mask8x64Cnt
+	fn.mask8x64Cnt = id + 1
+	return "m" + strconv.Itoa(id)
+}
+
+// newU returns a fresh uint64 variable name ("u0", "u1", ...) for the function
+// being generated.
+func (fn *expanderData) newU() string {
+	id := fn.uint64Cnt
+	fn.uint64Cnt = id + 1
+	return "u" + strconv.Itoa(id)
+}
+
+// expandIdentity implements 1x expansion (that is, no expansion).
+//
+// It replaces fn.Body with code that loads the 64 bytes at src and the 64
+// bytes that follow them, and returns the two vectors unchanged as the low and
+// high halves of the result.
+func (fn *expanderData) expandIdentity() {
+ fn.Body = `
+ x := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+ y := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(src)+64))).AsUint8x64()
+ return x.AsUint64x8(), y.AsUint64x8()`
+}
+
+// loadSrcAsUint8x64 emits a load of the 64 input bytes at src into a fresh
+// vector variable and returns that variable's name. The statement is appended
+// to BodyLoad.
+func (fn *expanderData) loadSrcAsUint8x64() string {
+	dst := fn.newVec()
+	stmt := fmt.Sprintf("%s := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()\n", dst)
+	fn.BodyLoad += stmt
+	return dst
+}
+
+// loadGlobalArrAsUint8x64 emits a load of the global [8]uint64 array arrName
+// into a fresh vector variable and returns that variable's name. The statement
+// is appended to BodyLoad.
+func (fn *expanderData) loadGlobalArrAsUint8x64(arrName string) string {
+	dst := fn.newVec()
+	stmt := fmt.Sprintf("%s := simd.LoadUint64x8(&%s).AsUint8x64()\n", dst, arrName)
+	fn.BodyLoad += stmt
+	return dst
+}
+
+// permuteUint8x64 emits "data.Permute(indices)" assigned to a fresh vector
+// variable and returns that variable's name.
+func (fn *expanderData) permuteUint8x64(data, indices string) string {
+	dst := fn.newVec()
+	stmt := fmt.Sprintf("%s := %s.Permute(%s)\n", dst, data, indices)
+	fn.Body += stmt
+	return dst
+}
+
+// permute2Uint8x64 emits "x.Permute2(y, indices)" assigned to a fresh vector
+// variable and returns that variable's name.
+func (fn *expanderData) permute2Uint8x64(x, y, indices string) string {
+	dst := fn.newVec()
+	stmt := fmt.Sprintf("%s := %s.Permute2(%s, %s)\n", dst, x, y, indices)
+	fn.Body += stmt
+	return dst
+}
+
+// permuteMaskedUint8x64 emits "data.Permute(indices).Masked(mask)" assigned to
+// a fresh vector variable and returns that variable's name.
+func (fn *expanderData) permuteMaskedUint8x64(data, indices, mask string) string {
+	dst := fn.newVec()
+	stmt := fmt.Sprintf("%s := %s.Permute(%s).Masked(%s)\n", dst, data, indices, mask)
+	fn.Body += stmt
+	return dst
+}
+
+// permute2MaskedUint8x64 emits "x.Permute2(y, indices).Masked(mask)" assigned
+// to a fresh vector variable and returns that variable's name.
+func (fn *expanderData) permute2MaskedUint8x64(x, y, indices, mask string) string {
+	dst := fn.newVec()
+	stmt := fmt.Sprintf("%s := %s.Permute2(%s, %s).Masked(%s)\n", dst, x, y, indices, mask)
+	fn.Body += stmt
+	return dst
+}
+
+// galoisFieldAffineTransformUint8x64 emits a GaloisFieldAffineTransform of
+// data by matrix (reinterpreted as Uint64x8) with a constant term of 0,
+// assigned to a fresh vector variable, and returns that variable's name.
+func (fn *expanderData) galoisFieldAffineTransformUint8x64(data, matrix string) string {
+	dst := fn.newVec()
+	stmt := fmt.Sprintf("%s := %s.GaloisFieldAffineTransform(%s.AsUint64x8(), 0)\n", dst, data, matrix)
+	fn.Body += stmt
+	return dst
+}
+
+// returns emits the final return statement, converting the vectors named x and
+// y to Uint64x8.
+func (fn *expanderData) returns(x, y string) {
+	stmt := fmt.Sprintf("return %s.AsUint64x8(), %s.AsUint64x8()", x, y)
+	fn.Body += stmt
+}
+
+func uint8x64Data(data [64]uint8) string {
+ res := ""
+ for i := range 8 {
+ ptr64 := (*uint64)(unsafe.Pointer(&data[i*8]))
+ res += fmt.Sprintf("%#016x,", *ptr64)
+ if i == 3 {
+ res += "\n"
+ }
+ }
+ return res
+}
+
+func uint64x8Data(data [8]uint64) string {
+ res := ""
+ for i := range 8 {
+ res += fmt.Sprintf("%#016x,", data[i])
+ if i == 3 {
+ res += "\n"
+ }
+ }
+ return res
+}
+
+// loadGlobalUint8x64 returns the name of a vector variable holding the given
+// 64-byte table.
+//
+// Tables are deduplicated by their formatted contents: if an identical table
+// was loaded before, the existing variable is returned and name is ignored.
+// Otherwise a global named <fn.Name>_<name> is recorded for emission and a
+// load of it is emitted.
+func (fn *expanderData) loadGlobalUint8x64(name string, data [64]uint8) string {
+	key := uint8x64Data(data)
+	if cached, found := fn.dataV2N[key]; found {
+		return cached
+	}
+	global := fmt.Sprintf("%s_%s", fn.Name, name)
+	fn.data = append(fn.data, expanderDataData{Name: global, Vals: key})
+	loaded := fn.loadGlobalArrAsUint8x64(global)
+	fn.dataV2N[key] = loaded
+	return loaded
+}
+
+// loadGlobalUint64x8 returns the name of a vector variable holding the given
+// eight-word table.
+//
+// Tables are deduplicated by their formatted contents: if an identical table
+// was loaded before, the existing variable is returned and name is ignored.
+// Otherwise a global named <fn.Name>_<name> is recorded for emission and a
+// load of it is emitted.
+func (fn *expanderData) loadGlobalUint64x8(name string, data [8]uint64) string {
+	key := uint64x8Data(data)
+	if cached, found := fn.dataV2N[key]; found {
+		return cached
+	}
+	global := fmt.Sprintf("%s_%s", fn.Name, name)
+	fn.data = append(fn.data, expanderDataData{Name: global, Vals: key})
+	loaded := fn.loadGlobalArrAsUint8x64(global)
+	fn.dataV2N[key] = loaded
+	return loaded
+}
+
+// mask8x64FromBits emits a uint64 constant holding data and a
+// simd.Mask8x64FromBits call on it, and returns the name of the resulting mask
+// variable.
+func (fn *expanderData) mask8x64FromBits(data uint64) string {
+	bitsVar := fn.newU()
+	maskVar := fn.newMask()
+	stmts := fmt.Sprintf("%s := uint64(%#x)\n%s := simd.Mask8x64FromBits(%s)\n",
+		bitsVar, data, maskVar, bitsVar)
+	fn.Body += stmts
+	return maskVar
+}
+
+// orUint8x64 emits "x.Or(y)" assigned to a fresh vector variable and returns
+// that variable's name.
+func (fn *expanderData) orUint8x64(x, y string) string {
+	dst := fn.newVec()
+	stmt := fmt.Sprintf("%s := %s.Or(%s)\n", dst, x, y)
+	fn.Body += stmt
+	return dst
+}
+
+// gfExpander emits into fn the body of a function that expands each bit in an
+// input bitmap into f consecutive bits in an output bitmap.
+//
+// The input of the generated function is
+//
+// src unsafe.Pointer = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
+//
+// The generated function returns two vectors:
+//
+// first result = The bottom 512 bits of the expanded bitmap
+// second result = The top 512 bits of the expanded bitmap
+//
+// gfExpander reports false, after logging the reason, if it cannot generate an
+// expander for f. In that case fn is left holding a partial body.
+func gfExpander(f int, fn *expanderData) bool {
+ // TODO(austin): For powers of 2 >= 8, we can use mask expansion ops to make this much simpler.
+
+ // TODO(austin): For f >= 8, I suspect there are better ways to do this.
+ //
+ // For example, we could use a mask expansion to get a full byte for each
+ // input bit, and separately create the bytes that blend adjacent bits, then
+ // shuffle those bytes together. Certainly for f >= 16 this makes sense
+ // because each of those bytes will be used, possibly more than once.
+
+ // Emit the load of the input bitmap; objBits names the resulting vector.
+ objBits := fn.loadSrcAsUint8x64()
+
+ // A term says that output byte oByte is input byte iByte multiplied by mat.
+ type term struct {
+ iByte, oByte int
+ mat mat8x8
+ }
+ var terms []term
+
+ // Iterate over all output bytes and construct the 8x8 GF2 matrix to compute
+ // the output byte from the appropriate input byte. Gather all of these into
+ // "terms".
+ for oByte := 0; oByte < 1024/8; oByte++ {
+ var byteMat mat8x8
+ iByte := -1
+ for oBit := oByte * 8; oBit < oByte*8+8; oBit++ {
+ iBit := oBit / f
+ if iByte == -1 {
+ iByte = iBit / 8
+ } else if iByte != iBit/8 {
+ log.Printf("output byte %d straddles input bytes %d and %d", oByte, iByte, iBit/8)
+ return false
+ }
+ // One way to view this is that the i'th row of the matrix will be
+ // ANDed with the input byte, and the parity of the result will set
+ // the i'th bit in the output. We use a simple 1 bit mask, so the
+ // parity is irrelevant beyond selecting out that one bit.
+ byteMat.mat[oBit%8] = 1 << (iBit % 8)
+ }
+ terms = append(terms, term{iByte, oByte, byteMat})
+ }
+
+ // Debugging aid, disabled by default.
+ if false {
+ // Print input byte -> output byte as a matrix
+ maxIByte, maxOByte := 0, 0
+ for _, term := range terms {
+ maxIByte = max(maxIByte, term.iByte)
+ maxOByte = max(maxOByte, term.oByte)
+ }
+ iToO := make([][]rune, maxIByte+1)
+ for i := range iToO {
+ iToO[i] = make([]rune, maxOByte+1)
+ }
+ matMap := make(map[mat8x8]int)
+ for _, term := range terms {
+ i, ok := matMap[term.mat]
+ if !ok {
+ i = len(matMap)
+ matMap[term.mat] = i
+ }
+ iToO[term.iByte][term.oByte] = 'A' + rune(i)
+ }
+ for o := range maxOByte + 1 {
+ fmt.Printf("%d", o)
+ for i := range maxIByte + 1 {
+ fmt.Printf(",")
+ if mat := iToO[i][o]; mat != 0 {
+ fmt.Printf("%c", mat)
+ }
+ }
+ fmt.Println()
+ }
+ }
+
+ // In hardware, each (8 byte) matrix applies to 8 bytes of data in parallel,
+ // and we get to operate on up to 8 matrixes in parallel (or 64 values). That is:
+ //
+ // abcdefgh ijklmnop qrstuvwx yzABCDEF GHIJKLMN OPQRSTUV WXYZ0123 456789_+
+ // mat0 mat1 mat2 mat3 mat4 mat5 mat6 mat7
+
+ // Group the terms by matrix, but limit each group to 8 terms.
+ const termsPerGroup = 8 // Number of terms we can multiply by the same matrix.
+ const groupsPerSuperGroup = 8 // Number of matrixes we can fit in a vector.
+
+ // matMap maps a matrix to the index of its term group that is still open.
+ matMap := make(map[mat8x8]int)
+ allMats := make(map[mat8x8]bool)
+ var termGroups [][]term
+ for _, term := range terms {
+ allMats[term.mat] = true
+
+ i, ok := matMap[term.mat]
+ if ok && f > groupsPerSuperGroup {
+ // The output is ultimately produced in two [64]uint8 registers.
+ // Getting every byte in the right place of each of these requires a
+ // final permutation that often requires more than one source.
+ //
+ // Up to 8x expansion, we can get a really nice grouping so we can use
+ // the same 8 matrix vector several times, without producing
+ // permutations that require more than two sources.
+ //
+ // Above 8x, however, we can't get nice matrixes anyway, so we
+ // instead prefer reducing the complexity of the permutations we
+ // need to produce the final outputs. To do this, avoid grouping
+ // together terms that are split across the two registers.
+ outRegister := termGroups[i][0].oByte / 64
+ if term.oByte/64 != outRegister {
+ ok = false
+ }
+ }
+ if !ok {
+ // Start a new term group.
+ i = len(termGroups)
+ matMap[term.mat] = i
+ termGroups = append(termGroups, nil)
+ }
+
+ termGroups[i] = append(termGroups[i], term)
+
+ if len(termGroups[i]) == termsPerGroup {
+ // This term group is full.
+ delete(matMap, term.mat)
+ }
+ }
+
+ for i, termGroup := range termGroups {
+ log.Printf("term group %d:", i)
+ for _, term := range termGroup {
+ log.Printf(" %+v", term)
+ }
+ }
+
+ // We can do 8 matrix multiplies in parallel, which is 8 term groups. Pack
+ // as many term groups as we can into each super-group to minimize the
+ // number of matrix multiplies.
+ //
+ // Ideally, we use the same matrix in each super-group, which might mean
+ // doing fewer than 8 multiplies at a time. That's fine because it never
+ // increases the total number of matrix multiplies.
+ //
+ // TODO: Packing the matrixes less densely may let us use more broadcast
+ // loads instead of general permutations, though. That replaces a load of
+ // the permutation with a load of the matrix, but is probably still slightly
+ // better.
+ var sgSize, nSuperGroups int
+ oneMatVec := f <= groupsPerSuperGroup
+ if oneMatVec {
+ // We can use the same matrix in each multiply by doing sgSize
+ // multiplies at a time.
+ //
+ // sgSize is groupsPerSuperGroup rounded down to a multiple of the
+ // number of distinct matrixes.
+ sgSize = groupsPerSuperGroup / len(allMats) * len(allMats)
+ nSuperGroups = (len(termGroups) + sgSize - 1) / sgSize
+ } else {
+ // We can't use the same matrix for each multiply. Just do as many at a
+ // time as we can.
+ //
+ // TODO: This is going to produce several distinct matrixes, when we
+ // probably only need two. Be smarter about how we create super-groups
+ // in this case. Maybe we build up an array of super-groups and then the
+ // loop below just turns them into ops?
+ sgSize = 8
+ nSuperGroups = (len(termGroups) + groupsPerSuperGroup - 1) / groupsPerSuperGroup
+ }
+
+ // Construct each super-group.
+ var matGroup [8]mat8x8
+ var matMuls []string
+ // perm[o] locates output byte o among the matrix multiply results:
+ // perm[o]/64 is the super-group and perm[o]%64 is the byte within it.
+ var perm [128]int
+ for sgi := range nSuperGroups {
+ var iperm [64]uint8
+ for i := range iperm {
+ iperm[i] = 0xff // "Don't care"
+ }
+ // Pick off sgSize term groups.
+ superGroup := termGroups[:min(len(termGroups), sgSize)]
+ termGroups = termGroups[len(superGroup):]
+ // Build the matrix and permutations for this super-group.
+ var thisMatGroup [8]mat8x8
+ for i, termGroup := range superGroup {
+ // All terms in this group have the same matrix. Pick one.
+ thisMatGroup[i] = termGroup[0].mat
+ for j, term := range termGroup {
+ // Build the input permutation.
+ iperm[i*termsPerGroup+j] = uint8(term.iByte)
+ // Build the output permutation.
+ perm[term.oByte] = sgi*groupsPerSuperGroup*termsPerGroup + i*termsPerGroup + j
+ }
+ }
+ log.Printf("input permutation %d: %v", sgi, iperm)
+
+ // Check that we're not making more distinct matrixes than expected.
+ if oneMatVec {
+ if sgi == 0 {
+ matGroup = thisMatGroup
+ } else if matGroup != thisMatGroup {
+ log.Printf("super-groups have different matrixes:\n%+v\n%+v", matGroup, thisMatGroup)
+ return false
+ }
+ }
+
+ // Emit matrix op.
+ matConst :=
+ fn.loadGlobalUint64x8(fmt.Sprintf("mat%d", sgi),
+ matGroupToVec(&thisMatGroup))
+ inShufConst :=
+ fn.loadGlobalUint8x64(fmt.Sprintf("inShuf%d", sgi),
+ iperm)
+ inOp := fn.permuteUint8x64(objBits, inShufConst)
+ matMul := fn.galoisFieldAffineTransformUint8x64(inOp, matConst)
+ matMuls = append(matMuls, matMul)
+ }
+
+ log.Printf("output permutation: %v", perm)
+
+ // Shuffle the matrix multiply results into the two output vectors.
+ //
+ // NOTE(review): the failure messages below print len(matMuls) and mention
+ // "1, 2, or 4", while genShuffle fails based on how many distinct inputs
+ // each half uses and also handles 3 - confirm the intended wording.
+ outLo, ok := genShuffle(fn, "outShufLo", (*[64]int)(perm[:64]), matMuls...)
+ if !ok {
+ log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
+ return false
+ }
+ outHi, ok := genShuffle(fn, "outShufHi", (*[64]int)(perm[64:]), matMuls...)
+ if !ok {
+ log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
+ return false
+ }
+ fn.returns(outLo, outHi)
+
+ return true
+}
+
+// genShuffle emits code that builds one 64-byte vector by gathering bytes from
+// the vectors named in args.
+//
+// perm[i] selects the source of output byte i: perm[i]/64 is an index into
+// args and perm[i]%64 is the byte within that vector. name is the base name of
+// the shuffle index tables, which are emitted as globals.
+//
+// It returns the name of the variable holding the result. If perm draws on
+// more than four distinct vectors, it emits nothing and reports false.
+func genShuffle(fn *expanderData, name string, perm *[64]int, args ...string) (string, bool) {
+	// Construct flattened permutation.
+	var vperm [64]byte
+
+	// Get the inputs used by this permutation.
+	var inputs []int
+	for i, src := range perm {
+		inputIdx := slices.Index(inputs, src/64)
+		if inputIdx == -1 {
+			inputIdx = len(inputs)
+			inputs = append(inputs, src/64)
+		}
+		vperm[i] = byte(src%64 | (inputIdx << 6))
+	}
+
+	if len(inputs) > 4 {
+		// Too many inputs. To support more, we'd need to separate tables much
+		// earlier. Right now all the indices fit in a byte, but with >4 inputs
+		// they might not (>256 bytes). Bail out before emitting anything.
+		return args[0], false
+	}
+
+	// Emit instructions for easy cases.
+	switch len(inputs) {
+	case 1:
+		constOp := fn.loadGlobalUint8x64(name, vperm)
+		return fn.permuteUint8x64(args[inputs[0]], constOp), true
+	case 2:
+		constOp := fn.loadGlobalUint8x64(name, vperm)
+		return fn.permute2Uint8x64(args[inputs[0]], args[inputs[1]], constOp), true
+	}
+
+	// Harder case, we need to shuffle in from up to 2 more tables.
+	//
+	// Perform two shuffles. One shuffle will get its data from the first
+	// two inputs, the other shuffle will get its data from the other one
+	// or two inputs. Each shuffle's mask zeroes the lanes that belong to the
+	// other shuffle.
+	var vperms [2][64]byte
+	var masks [2]uint64
+	for j, idx := range vperm {
+		for i := range vperms {
+			vperms[i][j] = 0xff // "Don't care"
+		}
+		// Every entry of vperm is a real index. In particular 0xff means
+		// "fourth input, byte 63" and must not be mistaken for the
+		// "don't care" filler used in vperms.
+		vperms[idx/128][j] = idx % 128
+		masks[idx/128] |= uint64(1) << j
+	}
+
+	// Validate that the masks are fully disjoint and cover every lane.
+	if masks[0]^masks[1] != ^uint64(0) {
+		panic("bad shuffle!")
+	}
+
+	// Generate constants.
+	constOps := make([]string, len(vperms))
+	for i, v := range vperms {
+		constOps[i] = fn.loadGlobalUint8x64(name+strconv.Itoa(i), v)
+	}
+
+	// Generate shuffles. The first always reads the first two inputs; the
+	// second reads the remaining one or two.
+	r0 := fn.permute2MaskedUint8x64(args[inputs[0]], args[inputs[1]], constOps[0], fn.mask8x64FromBits(masks[0]))
+	var r1 string
+	if len(inputs) == 3 {
+		r1 = fn.permuteMaskedUint8x64(args[inputs[2]], constOps[1], fn.mask8x64FromBits(masks[1]))
+	} else {
+		r1 = fn.permute2MaskedUint8x64(args[inputs[2]], args[inputs[3]], constOps[1], fn.mask8x64FromBits(masks[1]))
+	}
+	return fn.orUint8x64(r0, r1), true
+}
diff --git a/src/internal/runtime/gc/scan/scan_amd64.go b/src/internal/runtime/gc/scan/scan_amd64.go
index 2ac181f..85ef4ea 100644
--- a/src/internal/runtime/gc/scan/scan_amd64.go
+++ b/src/internal/runtime/gc/scan/scan_amd64.go
@@ -5,13 +5,23 @@
package scan

import (
+ "internal/abi"
"internal/cpu"
"internal/runtime/gc"
+ "math/bits"
+ "simd"
"unsafe"
)

func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
if CanAVX512() {
+ return ScanSpanPackedAVX512SIMD(mem, bufp, objMarks, sizeClass, ptrMask)
+ }
+ panic("not implemented")
+}
+
+func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+ if CanAVX512() {
return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
}
panic("not implemented")
@@ -34,6 +44,64 @@
//go:noescape
func scanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)

+func FilterNilSIMD(bufp *uintptr, n int32) (cnt int32) {
+ scanned := 0
+ buf := unsafe.Slice((*int64)(unsafe.Pointer(bufp)), int(n))
+ // Use the widest vector
+ var plainZeros simd.Int64x8
+ for ; scanned+8 <= int(n); scanned += 8 {
+ v := simd.LoadInt64x8Slice(buf[scanned:])
+ m := v.NotEqual(plainZeros)
+ v.Compress(m).StoreSlice(buf[cnt:])
+ // Count the mask bits
+ mbits := uint64(m.ToBits())
+ mbits &= 0xFF // Only the lower 8 bits are meaningful.
+ nonNilCnt := bits.OnesCount64(mbits)
+ cnt += int32(nonNilCnt)
+ }
+ // Scalar code to clean up tails.
+ for i := scanned; i < int(n); i++ {
+ if buf[i] != 0 {
+ buf[cnt] = buf[i]
+ cnt++
+ }
+ }
+ return
+}
+
+func ScanSpanPackedAVX512SIMD(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+ return FilterNilSIMD(bufp, scanSpanPackedAVX512SIMD(mem, bufp, objMarks, sizeClass, ptrMask))
+}
+
+func scanSpanPackedAVX512SIMD(mem unsafe.Pointer, buf *uintptr, objDarts *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+ m1, m2 := gcExpandersAVX512SIMD[sizeClass](abi.NoEscape(unsafe.Pointer(objDarts)))
+ ptrm := unsafe.Pointer(ptrMask)
+ m3 := simd.LoadUint64x8((*[8]uint64)(ptrm))
+ m4 := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(ptrm) + 64)))
+
+ masks := [128]uint8{}
+ counts := [128]uint8{}
+ m1m3 := m1.And(m3).AsUint8x64()
+ m2m4 := m2.And(m4).AsUint8x64()
+ m1m3.Store((*[64]uint8)(unsafe.Pointer(&masks[0])))
+ m2m4.Store((*[64]uint8)(unsafe.Pointer(&masks[64])))
+ m1m3.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[0])))
+ m2m4.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[64])))
+
+ for i := range 128 {
+ mv := masks[i]
+ if mv == 0 {
+ continue
+ }
+ m := simd.Mask64x8FromBits(mv)
+ ptrs := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(mem) + uintptr(i*64))))
+ ptrs.Compress(m).Store((*[8]uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(buf)) + uintptr(count*8))))
+ count += int32(counts[i])
+ }
+ simd.ClearAVXUpperBits()
+ return
+}
+
var avx512ScanPackedReqsMet = cpu.X86.HasAVX512VL &&
cpu.X86.HasAVX512BW &&
cpu.X86.HasGFNI &&
diff --git a/src/internal/runtime/gc/scan/scan_amd64_test.go b/src/internal/runtime/gc/scan/scan_amd64_test.go
index a914b4f..ee1d13c 100644
--- a/src/internal/runtime/gc/scan/scan_amd64_test.go
+++ b/src/internal/runtime/gc/scan/scan_amd64_test.go
@@ -17,3 +17,10 @@
}
testScanSpanPacked(t, scan.ScanSpanPackedAVX512)
}
+
+func TestScanSpanPackedAVX512SIMD(t *testing.T) {
+ if !scan.CanAVX512() {
+ t.Skip("no AVX512")
+ }
+ testScanSpanPacked(t, scan.ScanSpanPackedAVX512SIMD)
+}
diff --git a/src/internal/runtime/gc/scan/scan_generic.go b/src/internal/runtime/gc/scan/scan_generic.go
index a4d5182..2ba0e95 100644
--- a/src/internal/runtime/gc/scan/scan_generic.go
+++ b/src/internal/runtime/gc/scan/scan_generic.go
@@ -21,3 +21,6 @@
func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
}
+func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+ return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
+}
diff --git a/src/internal/runtime/gc/scan/scan_test.go b/src/internal/runtime/gc/scan/scan_test.go
index 14a0f6f..fc5e97b 100644
--- a/src/internal/runtime/gc/scan/scan_test.go
+++ b/src/internal/runtime/gc/scan/scan_test.go
@@ -203,6 +203,13 @@
scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
}
})
+ b.Run("impl=PlatformAsm", func(b *testing.B) {
+ b.SetBytes(avgBytes)
+ for i := range b.N {
+ page := pageOrder[i%len(pageOrder)]
+ scan.ScanSpanPackedAsm(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
+ }
+ })
}
})
}

Change information

Files:
  • M src/go/build/deps_test.go
  • M src/internal/runtime/gc/scan/expand_amd64.go
  • M src/internal/runtime/gc/scan/expand_amd64_test.go
  • M src/internal/runtime/gc/scan/expand_test.go
  • A src/internal/runtime/gc/scan/expanders_amd64.go
  • A src/internal/runtime/gc/scan/mkexpanders.go
  • M src/internal/runtime/gc/scan/scan_amd64.go
  • M src/internal/runtime/gc/scan/scan_amd64_test.go
  • M src/internal/runtime/gc/scan/scan_generic.go
  • M src/internal/runtime/gc/scan/scan_test.go
Change size: XL
Delta: 10 files changed, 2265 insertions(+), 3 deletions(-)
Open in Gerrit

Related details

Attention set is empty
Submit Requirements:
  • requirement is not satisfiedCode-Review
  • requirement satisfiedNo-Unresolved-Comments
  • requirement is not satisfiedReview-Enforcement
  • requirement is not satisfiedTryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newchange
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
Gerrit-Change-Number: 719520
Gerrit-PatchSet: 1
Gerrit-Owner: Junyang Shao <shaoj...@google.com>
unsatisfied_requirement
satisfied_requirement
open
diffy

Junyang Shao (Gerrit)

unread,
Nov 11, 2025, 1:49:43 AM (2 days ago) Nov 11
to goph...@pubsubhelper.golang.org, Michael Knyszek, David Chase, golang-co...@googlegroups.com
Attention needed from David Chase and Michael Knyszek

Junyang Shao voted and added 1 comment

Votes added by Junyang Shao

Commit-Queue+1

1 comment

Patchset-level comments
File-level comment, Patchset 1 (Latest):
Junyang Shao . unresolved

This CL reworked CL 688415 for the comments from @mkny...@google.com.

I added the new generator, but the naming parts are not done yet (also the GOEXPERIMENT plumbing).

This CL will be updated

Open in Gerrit

Related details

Attention is currently required from:
  • David Chase
  • Michael Knyszek
Submit Requirements:
    • requirement is not satisfiedCode-Review
    • requirement is not satisfiedNo-Unresolved-Comments
    • requirement is not satisfiedReview-Enforcement
    • requirement is not satisfiedTryBots-Pass
    Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
    Gerrit-MessageType: comment
    Gerrit-Project: go
    Gerrit-Branch: dev.simd
    Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
    Gerrit-Change-Number: 719520
    Gerrit-PatchSet: 1
    Gerrit-Owner: Junyang Shao <shaoj...@google.com>
    Gerrit-Reviewer: David Chase <drc...@google.com>
    Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
    Gerrit-Reviewer: Michael Knyszek <mkny...@google.com>
    Gerrit-Attention: David Chase <drc...@google.com>
    Gerrit-Attention: Michael Knyszek <mkny...@google.com>
    Gerrit-Comment-Date: Tue, 11 Nov 2025 06:49:38 +0000
    Gerrit-HasComments: Yes
    Gerrit-Has-Labels: Yes
    unsatisfied_requirement
    open
    diffy

    Junyang Shao (Gerrit)

    unread,
    Nov 11, 2025, 2:22:27 AM (2 days ago) Nov 11
    to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com
    Attention needed from David Chase and Michael Knyszek

    Junyang Shao uploaded new patchset

    Junyang Shao uploaded patch set #2 to this change.
    Following approvals got outdated and were removed:
    • TryBots-Pass: LUCI-TryBot-Result+1 by Go LUCI
    Open in Gerrit

    Related details

    Attention is currently required from:
    • David Chase
    • Michael Knyszek
    Submit Requirements:
    • requirement is not satisfiedCode-Review
    • requirement is not satisfiedNo-Unresolved-Comments
    • requirement is not satisfiedReview-Enforcement
    • requirement is not satisfiedTryBots-Pass
    Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
    Gerrit-MessageType: newpatchset
    Gerrit-Project: go
    Gerrit-Branch: dev.simd
    Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
    Gerrit-Change-Number: 719520
    Gerrit-PatchSet: 2
    Gerrit-Owner: Junyang Shao <shaoj...@google.com>
    Gerrit-Reviewer: David Chase <drc...@google.com>
    unsatisfied_requirement
    open
    diffy

    Junyang Shao (Gerrit)

    unread,
    Nov 11, 2025, 2:37:23 AM (2 days ago) Nov 11
    to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com
    Attention needed from David Chase and Michael Knyszek

    Junyang Shao uploaded new patchset

    Junyang Shao uploaded patch set #3 to this change.
    Open in Gerrit

    Related details

    Attention is currently required from:
    • David Chase
    • Michael Knyszek
    Submit Requirements:
    • requirement is not satisfiedCode-Review
    • requirement is not satisfiedNo-Unresolved-Comments
    • requirement is not satisfiedReview-Enforcement
    • requirement is not satisfiedTryBots-Pass
    Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
    Gerrit-MessageType: newpatchset
    Gerrit-Project: go
    Gerrit-Branch: dev.simd
    Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
    Gerrit-Change-Number: 719520
    Gerrit-PatchSet: 3
    unsatisfied_requirement
    open
    diffy

    Junyang Shao (Gerrit)

    unread,
    Nov 11, 2025, 2:39:01 AM (2 days ago) Nov 11
    to goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, David Chase, golang-co...@googlegroups.com
    Attention needed from David Chase and Michael Knyszek

    Junyang Shao voted and added 1 comment

    Votes added by Junyang Shao

    Commit-Queue+1

    1 comment

    Patchset-level comments
    File-level comment, Patchset 1:
    Junyang Shao . resolved

    This CL reworked CL 688415 for the comments from @mkny...@google.com.

    I added the new generator, but the naming parts are not done yet (also the GOEXPERIMENT plumbing).

    This CL will be updated

    Junyang Shao

    Done.

    One note: I am not sure how to do PCALIGN in Go code, so I left a TODO there.
    Apart from this, as far as I know, the Go SIMD code should be equivalent to the asm code...

    Open in Gerrit

    Related details

    Attention is currently required from:
    • David Chase
    • Michael Knyszek
    Submit Requirements:
      • requirement is not satisfiedCode-Review
      • requirement satisfiedNo-Unresolved-Comments
      • requirement is not satisfiedReview-Enforcement
      • requirement is not satisfiedTryBots-Pass
      Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
      Gerrit-MessageType: comment
      Gerrit-Project: go
      Gerrit-Branch: dev.simd
      Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
      Gerrit-Change-Number: 719520
      Gerrit-PatchSet: 3
      Gerrit-Owner: Junyang Shao <shaoj...@google.com>
      Gerrit-Reviewer: David Chase <drc...@google.com>
      Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
      Gerrit-Reviewer: Michael Knyszek <mkny...@google.com>
      Gerrit-Attention: David Chase <drc...@google.com>
      Gerrit-Attention: Michael Knyszek <mkny...@google.com>
      Gerrit-Comment-Date: Tue, 11 Nov 2025 07:38:57 +0000
      Gerrit-HasComments: Yes
      Gerrit-Has-Labels: Yes
      Comment-In-Reply-To: Junyang Shao <shaoj...@google.com>
      unsatisfied_requirement
      satisfied_requirement
      open
      diffy

      Junyang Shao (Gerrit)

      unread,
      Nov 11, 2025, 2:47:31 AM (2 days ago) Nov 11
      to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com
      Attention needed from David Chase, Junyang Shao and Michael Knyszek

      Junyang Shao uploaded new patchset

      Junyang Shao uploaded patch set #4 to this change.
      Following approvals got outdated and were removed:
      • TryBots-Pass: LUCI-TryBot-Result-1 by Go LUCI
      Open in Gerrit

      Related details

      Attention is currently required from:
      • David Chase
      • Junyang Shao
      • Michael Knyszek
      Submit Requirements:
      • requirement is not satisfiedCode-Review
      • requirement satisfiedNo-Unresolved-Comments
      • requirement is not satisfiedReview-Enforcement
      • requirement is not satisfiedTryBots-Pass
      Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
      Gerrit-MessageType: newpatchset
      Gerrit-Project: go
      Gerrit-Branch: dev.simd
      Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
      Gerrit-Change-Number: 719520
      Gerrit-PatchSet: 4
      Gerrit-Owner: Junyang Shao <shaoj...@google.com>
      Gerrit-Reviewer: David Chase <drc...@google.com>
      Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
      Gerrit-Reviewer: Michael Knyszek <mkny...@google.com>
      Gerrit-Attention: David Chase <drc...@google.com>
      Gerrit-Attention: Junyang Shao <shaoj...@google.com>
      Gerrit-Attention: Michael Knyszek <mkny...@google.com>
      unsatisfied_requirement
      satisfied_requirement
      open
      diffy

      Junyang Shao (Gerrit)

      unread,
      Nov 11, 2025, 2:47:38 AM (2 days ago) Nov 11
      to goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, David Chase, golang-co...@googlegroups.com
      Attention needed from David Chase and Michael Knyszek

      Junyang Shao voted Commit-Queue+1

      Commit-Queue+1
      Open in Gerrit

      Related details

      Attention is currently required from:
      • David Chase
      • Michael Knyszek
      Submit Requirements:
      • requirement is not satisfiedCode-Review
      • requirement satisfiedNo-Unresolved-Comments
      • requirement is not satisfiedReview-Enforcement
      • requirement is not satisfiedTryBots-Pass
      Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
      Gerrit-MessageType: comment
      Gerrit-Project: go
      Gerrit-Branch: dev.simd
      Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
      Gerrit-Change-Number: 719520
      Gerrit-PatchSet: 4
      Gerrit-Owner: Junyang Shao <shaoj...@google.com>
      Gerrit-Reviewer: David Chase <drc...@google.com>
      Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
      Gerrit-Reviewer: Michael Knyszek <mkny...@google.com>
      Gerrit-Attention: David Chase <drc...@google.com>
      Gerrit-Attention: Michael Knyszek <mkny...@google.com>
      Gerrit-Comment-Date: Tue, 11 Nov 2025 07:47:33 +0000
      Gerrit-HasComments: No
      Gerrit-Has-Labels: Yes
      unsatisfied_requirement
      satisfied_requirement
      open
      diffy

      Junyang Shao (Gerrit)

      unread,
      Nov 11, 2025, 3:09:55 AM (2 days ago) Nov 11
      to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com
      Attention needed from David Chase and Michael Knyszek

      Junyang Shao uploaded new patchset

      Junyang Shao uploaded patch set #5 to this change.
      Open in Gerrit

      Related details

      Attention is currently required from:
      • David Chase
      • Michael Knyszek
      Submit Requirements:
        • requirement is not satisfiedCode-Review
        • requirement satisfiedNo-Unresolved-Comments
        • requirement is not satisfiedReview-Enforcement
        • requirement satisfiedTryBots-Pass
        Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
        Gerrit-MessageType: newpatchset
        Gerrit-Project: go
        Gerrit-Branch: dev.simd
        Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
        Gerrit-Change-Number: 719520
        Gerrit-PatchSet: 5
        unsatisfied_requirement
        satisfied_requirement
        open
        diffy

        Michael Knyszek (Gerrit)

        unread,
        Nov 11, 2025, 1:53:46 PM (2 days ago) Nov 11
        to Junyang Shao, goph...@pubsubhelper.golang.org, Go LUCI, David Chase, golang-co...@googlegroups.com
        Attention needed from David Chase and Junyang Shao

        Michael Knyszek added 14 comments

        Patchset-level comments
        File-level comment, Patchset 5 (Latest):
        Michael Knyszek . resolved

        nice job! looks good overall, mostly comments about naming and adding more comments

        Commit Message
        Line 7, Patchset 5 (Latest):[dev.simd] runtime/gc: add simd package based greentea kernels
        Michael Knyszek . unresolved

        internal/runtime/gc

        Line 9, Patchset 5 (Latest):This CL adds a new generator to runtime/gc/scan that generates expander
        Michael Knyszek . unresolved

        internal/runtime/gc/scan

        Line 16, Patchset 5 (Latest):The Go SIMD kernels on average is -11% in ns/op compared to Asm kernels
        in BenchmarkScanSpanPacked.
        Michael Knyszek . unresolved

        is this mostly coming from the FilterNil kernels? I imagine the rest of it should be identical.

        File src/internal/runtime/gc/scan/mkexpanders.go
        Line 6, Patchset 5 (Latest):// assemblies, this file generates Go code using the simd
        Michael Knyszek . unresolved

        nit: "that uses the simd package." I don't think we need to mention the GOEXPERIMENT, it's just another comment that's going to get stale once we remove the GOEXPERIMENT.

        Line 6, Patchset 5 (Latest):// assemblies, this file generates Go code using the simd
        Michael Knyszek . unresolved

        nit: assembly code

        Line 62, Patchset 5 (Latest):type expandersListData struct {
        Michael Knyszek . unresolved

        nit: I think Data is redundant here. you can just call this expandersList, or even better, just "expanders."

        Line 63, Patchset 5 (Latest): NumFuncs int
        Michael Knyszek . unresolved

        this is redundant. can't we just use the length of Funcs? I'm pretty sure you can get a slice value's length in a template.

        (better yet, perhaps this should just be `type expanders []string`?)

        Line 67, Patchset 5 (Latest):type expanderDataData struct {
        Michael Knyszek . unresolved

        I think you can just call this 'expanderData'. AFAICT this is just for the global tables?

        on that note, this could use a comment explaining what it is, and a comment on the fields explaining what they do.

        Line 72, Patchset 5 (Latest):type expanderData struct {
        Michael Knyszek . unresolved

        maybe call this 'expanderFunc' or just 'expander'?

        Line 73, Patchset 5 (Latest): Name string
        BodyLoad string
        Body string
        Michael Knyszek . unresolved

        comments on these fields would be very helpful.

        Line 77, Patchset 5 (Latest): dataV2N map[string]string
        Michael Knyszek . unresolved

        maybe "dataByVals"?

        Line 321, Patchset 5 (Latest):// AX *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
        //
        // The output is
        //
        // Z1 [64]uint8 = The bottom 512 bits of the expanded bitmap
        // Z2 [64]uint8 = The top 512 bits of the expanded bitmap
        //
        // TODO(austin): This should Z0/Z1.
        Michael Knyszek . unresolved

        this comment needs an update, you're not selecting registers anymore

        File src/internal/runtime/gc/scan/scan_generic.go
        Line 25, Patchset 5 (Latest): return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
        Michael Knyszek . unresolved

        IMO, just make this a panic and have the tests skip if we're not on amd64. (maybe make a new global constant that indicates what platforms ScanSpanPackedAsm is supported on.)

        I think the current situation is a little misleading.

        Open in Gerrit

        Related details

        Attention is currently required from:
        • David Chase
        • Junyang Shao
        Submit Requirements:
          • requirement is not satisfiedCode-Review
          • requirement is not satisfiedNo-Unresolved-Comments
          • requirement is not satisfiedReview-Enforcement
          • requirement satisfiedTryBots-Pass
          Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
          Gerrit-MessageType: comment
          Gerrit-Project: go
          Gerrit-Branch: dev.simd
          Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
          Gerrit-Change-Number: 719520
          Gerrit-PatchSet: 5
          Gerrit-Owner: Junyang Shao <shaoj...@google.com>
          Gerrit-Reviewer: David Chase <drc...@google.com>
          Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
          Gerrit-Reviewer: Michael Knyszek <mkny...@google.com>
          Gerrit-Attention: David Chase <drc...@google.com>
          Gerrit-Attention: Junyang Shao <shaoj...@google.com>
          Gerrit-Comment-Date: Tue, 11 Nov 2025 18:53:41 +0000
          Gerrit-HasComments: Yes
          Gerrit-Has-Labels: No
          unsatisfied_requirement
          satisfied_requirement
          open
          diffy

          Michael Knyszek (Gerrit)

          unread,
          Nov 11, 2025, 1:54:37 PM (2 days ago) Nov 11
          to Junyang Shao, goph...@pubsubhelper.golang.org, Go LUCI, David Chase, golang-co...@googlegroups.com
          Attention needed from David Chase and Junyang Shao

          Michael Knyszek added 1 comment


          The Go SIMD kernels on average is -11% in ns/op compared to Asm kernels
          in BenchmarkScanSpanPacked.
          Michael Knyszek . unresolved

          please add the benchstat output directly in the commit message, like we do for some other commits. you don't need to include all the results, a small sample is fine.

          Gerrit-Comment-Date: Tue, 11 Nov 2025 18:54:34 +0000
          Gerrit-HasComments: Yes
          Gerrit-Has-Labels: No
          unsatisfied_requirement
          satisfied_requirement
          open
          diffy

          Junyang Shao (Gerrit)

          unread,
          Nov 11, 2025, 5:08:52 PM (2 days ago) Nov 11
          to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com
          Attention needed from David Chase and Junyang Shao

          Junyang Shao uploaded new patchset

          Junyang Shao uploaded patch set #6 to this change.
          Following approvals got outdated and were removed:
          • TryBots-Pass: LUCI-TryBot-Result+1 by Go LUCI
          Open in Gerrit

          Related details

          Attention is currently required from:
          • David Chase
          • Junyang Shao
          Submit Requirements:
            • requirement is not satisfiedCode-Review
            • requirement is not satisfiedNo-Unresolved-Comments
            • requirement is not satisfiedReview-Enforcement
            • requirement is not satisfiedTryBots-Pass
            Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
            Gerrit-MessageType: newpatchset
            Gerrit-Project: go
            Gerrit-Branch: dev.simd
            Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
            Gerrit-Change-Number: 719520
            Gerrit-PatchSet: 6
            unsatisfied_requirement
            open
            diffy

            Junyang Shao (Gerrit)

            unread,
            Nov 11, 2025, 5:09:02 PM (2 days ago) Nov 11
            to goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, David Chase, golang-co...@googlegroups.com
            Attention needed from David Chase and Michael Knyszek

            Junyang Shao voted and added 14 comments

            Votes added by Junyang Shao

            Commit-Queue+1

            14 comments

            Commit Message
            Line 7, Patchset 5:[dev.simd] runtime/gc: add simd package based greentea kernels
            Michael Knyszek . resolved

            internal/runtime/gc

            Junyang Shao

            Done

            Line 9, Patchset 5:This CL adds a new generator to runtime/gc/scan that generates expander
            Michael Knyszek . resolved

            internal/runtime/gc/scan

            Junyang Shao

            Done


            The Go SIMD kernels on average is -11% in ns/op compared to Asm kernels
            in BenchmarkScanSpanPacked.
            Michael Knyszek . resolved

            please add the benchstat output directly in the commit message, like we do for some other commits. you don't need to include all the results, a small sample is fine.

            Junyang Shao

            Done

            Line 16, Patchset 5:The Go SIMD kernels on average is -11% in ns/op compared to Asm kernels
            in BenchmarkScanSpanPacked.
            Michael Knyszek . resolved

            is this mostly coming from the FilterNil kernels? I imagine the rest of it should be identical.

            Junyang Shao

            I guess so. Some more context of the benchmarkings I did:
            The benchmark in this CL description is comparing:
            (1) FilterNilAVX512 + Go SIMD expanders/scanners
            (2) FilterNil + Asm expanders/scanners
            And the result is (1) is +25% on throughput compared to (2).

            I did this before on the older CL:
            (3) FilterNil + Go SIMD expanders/scanners
            (4) FilterNil + Asm expanders/scanners
            The throughput was roughly the same.

            But since then we have added many optimizations to the SIMD branch; on the older CL, (1) was only +10% to (2). Now since (1) is +25% to (2), it might be possible that (3) is also strictly better than (4).

            I will run another benchmark later. :D

            File src/internal/runtime/gc/scan/mkexpanders.go
            Line 6, Patchset 5:// assemblies, this file generates Go code using the simd
            Michael Knyszek . resolved

            nit: "that uses the simd package." I don't think we need to mention the GOEXPERIMENT, it's just another comment that's going to get stale once we remove the GOEXPERIMENT.

            Junyang Shao

            Done

            Line 6, Patchset 5:// assemblies, this file generates Go code using the simd
            Michael Knyszek . resolved

            nit: assembly code

            Junyang Shao

            Done

            Line 62, Patchset 5:type expandersListData struct {
            Michael Knyszek . resolved

            nit: I think Data is redundant here. you can just call this expandersList, or even better, just "expanders."

            Junyang Shao

            Done

            Line 63, Patchset 5: NumFuncs int
            Michael Knyszek . resolved

            this is redundant. can't we just use the length of Funcs? I'm pretty sure you can get a slice value's length in a template.

            (better yet, perhaps this should just be `type expanders []string`?)

            Junyang Shao

            We probably just don't need that type, updated. Thanks!

            Line 67, Patchset 5:type expanderDataData struct {
            Michael Knyszek . resolved

            I think you can just call this 'expanderData'. AFAICT this is just for the global tables?

            on that note, this could use a comment explaining what it is, and a comment on the fields explaining what they do.

            Junyang Shao

            Done

            Line 72, Patchset 5:type expanderData struct {
            Michael Knyszek . resolved

            maybe call this 'expanderFunc' or just 'expander'?

            Junyang Shao

            Done

            Line 73, Patchset 5: Name string
            BodyLoad string
            Body string
            Michael Knyszek . resolved

            comments on these fields would be very helpful.

            Junyang Shao

            Done

            Line 77, Patchset 5: dataV2N map[string]string
            Michael Knyszek . resolved

            maybe "dataByVals"?

            Junyang Shao

            Done

            Line 321, Patchset 5:// AX *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)

            //
            // The output is
            //
            // Z1 [64]uint8 = The bottom 512 bits of the expanded bitmap
            // Z2 [64]uint8 = The top 512 bits of the expanded bitmap
            //
            // TODO(austin): This should Z0/Z1.
            Michael Knyszek . resolved

            this comment needs an update, you're not selecting registers anymore

            Junyang Shao

            Done

            File src/internal/runtime/gc/scan/scan_generic.go
            Line 25, Patchset 5: return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
            Michael Knyszek . resolved

            IMO, just make this a panic and have the tests skip if we're not on amd64. (maybe make a new global constant that indicates what platforms ScanSpanPackedAsm is supported on.)

            I think the current situation is a little misleading.

            Junyang Shao

            I think both of these functions are guarded by `HasFastScanSpanPacked` and will never actually be called, so I made them both panic.

            Open in Gerrit

            Related details

            Attention is currently required from:
            • David Chase
            • Michael Knyszek
            Submit Requirements:
              • requirement is not satisfiedCode-Review
              • requirement satisfiedNo-Unresolved-Comments
              • requirement is not satisfiedReview-Enforcement
              • requirement is not satisfiedTryBots-Pass
              Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
              Gerrit-MessageType: comment
              Gerrit-Project: go
              Gerrit-Branch: dev.simd
              Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
              Gerrit-Change-Number: 719520
              Gerrit-PatchSet: 6
              Gerrit-Owner: Junyang Shao <shaoj...@google.com>
              Gerrit-Reviewer: David Chase <drc...@google.com>
              Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
              Gerrit-Reviewer: Michael Knyszek <mkny...@google.com>
              Gerrit-Attention: David Chase <drc...@google.com>
              Gerrit-Attention: Michael Knyszek <mkny...@google.com>
              Gerrit-Comment-Date: Tue, 11 Nov 2025 22:08:59 +0000
              Gerrit-HasComments: Yes
              Gerrit-Has-Labels: Yes
              Comment-In-Reply-To: Michael Knyszek <mkny...@google.com>
              unsatisfied_requirement
              satisfied_requirement
              open
              diffy

              David Chase (Gerrit)

              unread,
              Nov 11, 2025, 5:44:03 PM (2 days ago) Nov 11
              to Junyang Shao, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, golang-co...@googlegroups.com
              Attention needed from Junyang Shao and Michael Knyszek

              David Chase added 1 comment

              File src/internal/runtime/gc/scan/scan_amd64.go
              Line 13, Patchset 6 (Latest): "simd"
              David Chase . unresolved

              I'm confused, doesn't this need a goexperiment.simd build tag?

              Open in Gerrit

              Related details

              Attention is currently required from:
              • Junyang Shao
              • Michael Knyszek
              Submit Requirements:
                • requirement is not satisfied: Code-Review
                • requirement is not satisfied: No-Unresolved-Comments
                • requirement is not satisfied: Review-Enforcement
                • requirement is not satisfied: TryBots-Pass
                Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
                Gerrit-MessageType: comment
                Gerrit-Project: go
                Gerrit-Branch: dev.simd
                Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
                Gerrit-Change-Number: 719520
                Gerrit-PatchSet: 6
                Gerrit-Owner: Junyang Shao <shaoj...@google.com>
                Gerrit-Reviewer: David Chase <drc...@google.com>
                Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
                Gerrit-Reviewer: Michael Knyszek <mkny...@google.com>
                Gerrit-Attention: Junyang Shao <shaoj...@google.com>
                Gerrit-Attention: Michael Knyszek <mkny...@google.com>
                Gerrit-Comment-Date: Tue, 11 Nov 2025 22:43:59 +0000
                Gerrit-HasComments: Yes
                Gerrit-Has-Labels: No
                unsatisfied_requirement
                open
                diffy

                Junyang Shao (Gerrit)

                unread,
                Nov 12, 2025, 11:02:56 AM (18 hours ago) Nov 12
                to goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, David Chase, golang-co...@googlegroups.com
                Attention needed from David Chase and Michael Knyszek

                Junyang Shao added 1 comment

                File src/internal/runtime/gc/scan/scan_amd64.go
                David Chase . unresolved

                I'm confused, doesn't this need a goexperiment.simd build tag?

                Junyang Shao

                Ohh, you might be right; maybe it's because the simd experiment is always on in this branch, so the compilation doesn't fail.

                I will update...

                Open in Gerrit

                Related details

                Attention is currently required from:
                • David Chase
                • Michael Knyszek
                Submit Requirements:
                • requirement is not satisfied: Code-Review
                • requirement is not satisfied: No-Unresolved-Comments
                • requirement is not satisfied: Review-Enforcement
                • requirement is not satisfied: TryBots-Pass
                Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
                Gerrit-MessageType: comment
                Gerrit-Project: go
                Gerrit-Branch: dev.simd
                Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
                Gerrit-Change-Number: 719520
                Gerrit-PatchSet: 6
                Gerrit-Owner: Junyang Shao <shaoj...@google.com>
                Gerrit-Reviewer: David Chase <drc...@google.com>
                Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
                Gerrit-Reviewer: Michael Knyszek <mkny...@google.com>
                Gerrit-Attention: David Chase <drc...@google.com>
                Gerrit-Attention: Michael Knyszek <mkny...@google.com>
                Gerrit-Comment-Date: Wed, 12 Nov 2025 16:02:40 +0000
                Gerrit-HasComments: Yes
                Gerrit-Has-Labels: No
                Comment-In-Reply-To: David Chase <drc...@google.com>
                unsatisfied_requirement
                open
                diffy

                David Chase (Gerrit)

                unread,
                Nov 12, 2025, 11:24:27 AM (18 hours ago) Nov 12
                to Junyang Shao, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, golang-co...@googlegroups.com
                Attention needed from Junyang Shao and Michael Knyszek

                David Chase added 1 comment

                File src/internal/runtime/gc/scan/scan_amd64.go
                David Chase . unresolved

                I'm confused, doesn't this need a goexperiment.simd build tag?

                Junyang Shao

                Ohh, you might be right; maybe it's because the simd experiment is always on in this branch, so the compilation doesn't fail.

                I will update...

                David Chase

                This will be a general issue before the big merge.

                Open in Gerrit

                Related details

                Attention is currently required from:
                • Junyang Shao
                • Michael Knyszek
                Submit Requirements:
                • requirement is not satisfied: Code-Review
                • requirement is not satisfied: No-Unresolved-Comments
                • requirement is not satisfied: Review-Enforcement
                • requirement is not satisfied: TryBots-Pass
                Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
                Gerrit-MessageType: comment
                Gerrit-Project: go
                Gerrit-Branch: dev.simd
                Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
                Gerrit-Change-Number: 719520
                Gerrit-PatchSet: 6
                Gerrit-Owner: Junyang Shao <shaoj...@google.com>
                Gerrit-Reviewer: David Chase <drc...@google.com>
                Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
                Gerrit-Reviewer: Michael Knyszek <mkny...@google.com>
                Gerrit-Attention: Junyang Shao <shaoj...@google.com>
                Gerrit-Attention: Michael Knyszek <mkny...@google.com>
                Gerrit-Comment-Date: Wed, 12 Nov 2025 16:24:22 +0000
                Gerrit-HasComments: Yes
                Gerrit-Has-Labels: No
                Comment-In-Reply-To: David Chase <drc...@google.com>
                Comment-In-Reply-To: Junyang Shao <shaoj...@google.com>
                unsatisfied_requirement
                open
                diffy

                Michael Knyszek (Gerrit)

                unread,
                Nov 12, 2025, 11:45:48 AM (17 hours ago) Nov 12
                to Junyang Shao, goph...@pubsubhelper.golang.org, Go LUCI, David Chase, golang-co...@googlegroups.com
                Attention needed from Junyang Shao

                Michael Knyszek voted and added 5 comments

                Votes added by Michael Knyszek

                Code-Review+2

                5 comments

                File src/internal/runtime/gc/scan/expand_amd64.s
                File-level comment, Patchset 6 (Latest):
                Michael Knyszek . unresolved

                nit: can you rename this file to `expanders_amd64.s`? just to match your new file.

                File src/internal/runtime/gc/scan/mkexpanders.go
                Line 69, Patchset 6 (Latest): // expander is the expander function, it only provides 3 kinds of values:
                Michael Knyszek . unresolved

                nit: "it only operates on 3 kinds of values" maybe?

                Line 165, Patchset 6 (Latest): fn.Body += fmt.Sprintf("%s := %s.GaloisFieldAffineTransform(%s.AsUint64x8(), 0)\n", v, data, matrix)
                Michael Knyszek . unresolved

                this has O(n^2) behavior since we're appending to a string, forcing an O(n) copy of the string on every append. this can be more efficient, and perhaps be written more cleanly, if `fn.Body` was a `strings.Builder`. then instead of `+= fmt.Sprintf(...)` you would do `fmt.Fprintf(&fn.Body, ...)`.

                this is just a generator so the performance doesn't really matter, but it does nag at my brain... 😄

                if you don't feel like changing this, feel free to just acknowledge this comment and move on.

                File src/internal/runtime/gc/scan/scan_generic.go
                Line 22, Patchset 6 (Latest): panic("should not reach")
                Michael Knyszek . unresolved

                ah, sorry, I think ScanSpanPacked should still call ScanSpanPackedGo in the generic implementation. I just think the assembly one should crash.

                Line 25, Patchset 6 (Latest): panic("should not reach")
                Michael Knyszek . unresolved

                nit: we usually write "not implemented" here

                Open in Gerrit

                Related details

                Attention is currently required from:
                • Junyang Shao
                Submit Requirements:
                • requirement satisfied: Code-Review
                • requirement is not satisfied: No-Unresolved-Comments
                • requirement satisfied: Review-Enforcement
                • requirement is not satisfied: TryBots-Pass
                Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
                Gerrit-MessageType: comment
                Gerrit-Project: go
                Gerrit-Branch: dev.simd
                Gerrit-Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
                Gerrit-Change-Number: 719520
                Gerrit-PatchSet: 6
                Gerrit-Owner: Junyang Shao <shaoj...@google.com>
                Gerrit-Reviewer: David Chase <drc...@google.com>
                Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
                Gerrit-Reviewer: Michael Knyszek <mkny...@google.com>
                Gerrit-Attention: Junyang Shao <shaoj...@google.com>
                Gerrit-Comment-Date: Wed, 12 Nov 2025 16:45:42 +0000
                Gerrit-HasComments: Yes
                Gerrit-Has-Labels: Yes
                satisfied_requirement
                unsatisfied_requirement
                open
                diffy
                Reply all
                Reply to author
                Forward
                0 new messages