David Chase (Gerrit)

unread,

Jun 5, 2026, 4:32:18 PM (5 days ago) Jun 5

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

David Chase has uploaded the change for review

Commit message

simd: add some inner-product benchmarks

Some hand-unrolling of important methods
was tried, and shown to improve results.

Change-Id: I09b912a3b68431972eeb91200f77a3dd95327d91

Change diff

diff --git a/src/simd/internal/bridge/simd_emulated.go b/src/simd/internal/bridge/simd_emulated.go
index ba4d1b2..58527f88 100644
--- a/src/simd/internal/bridge/simd_emulated.go
+++ b/src/simd/internal/bridge/simd_emulated.go
@@ -2537,9 +2537,10 @@
 // Add returns the element-wise sum of x and y.
 func (x Float32s) Add(y Float32s) Float32s {
 	var res Float32s
-	for i := 0; i < 4; i++ {
-		res.set(i, x.get(i)+y.get(i))
-	}
+	res.set(0, x.get(0)+y.get(0))
+	res.set(1, x.get(1)+y.get(1))
+	res.set(2, x.get(2)+y.get(2))
+	res.set(3, x.get(3)+y.get(3))
 	return res
 }
 
@@ -2667,18 +2668,21 @@
 // Mul returns the element-wise product of x and y.
 func (x Float32s) Mul(y Float32s) Float32s {
 	var res Float32s
-	for i := 0; i < 4; i++ {
-		res.set(i, x.get(i)*y.get(i))
-	}
+	res.set(0, x.get(0)*y.get(0))
+	res.set(1, x.get(1)*y.get(1))
+	res.set(2, x.get(2)*y.get(2))
+	res.set(3, x.get(3)*y.get(3))
+
 	return res
 }
 
 // MulAdd returns x * y + z element-wise.
 func (x Float32s) MulAdd(y, z Float32s) Float32s {
 	var res Float32s
-	for i := 0; i < 4; i++ {
-		res.set(i, x.get(i)+y.get(i)*z.get(i))
-	}
+	res.set(0, x.get(0)+y.get(0)*z.get(0))
+	res.set(1, x.get(1)+y.get(1)*z.get(1))
+	res.set(2, x.get(2)+y.get(2)*z.get(2))
+	res.set(3, x.get(3)+y.get(3)*z.get(3))
 	return res
 }
 
diff --git a/src/simd/ip_test.go b/src/simd/ip_test.go
index 90aea58..ebd8b4f 100644
--- a/src/simd/ip_test.go
+++ b/src/simd/ip_test.go
@@ -8,6 +8,7 @@
 
 import (
 	"fmt"
+	"math/rand/v2"
 	"simd"
 	"testing"
 )
@@ -48,6 +49,225 @@
 	return t
 }
 
+const ipBenchLen = 300000
+
+// BenchmarkIP is simd vector inner product, vanilla transcription.
+func BenchmarkIP(b *testing.B) {
+	x := make([]float32, ipBenchLen)
+	y := make([]float32, ipBenchLen)
+
+	for i := range x {
+		x[i] = 2*rand.Float32() - 1
+		y[i] = 2*rand.Float32() - 1
+	}
+	ip0, _, _ := ip(x, y)
+
+	var errors int
+	for b.Loop() {
+		z, _, _ := ip(x, y)
+		if z != ip0 {
+			errors++
+		}
+	}
+	if errors > 0 {
+		b.Logf("errors = %d\n", errors)
+	}
+}
+
+// BenchmarkIPUnroll is simd vector inner product, unrolled 4x vector ops.
+func BenchmarkIPUnroll(b *testing.B) {
+	x := make([]float32, ipBenchLen)
+	y := make([]float32, ipBenchLen)
+
+	for i := range x {
+		x[i] = 2*rand.Float32() - 1
+		y[i] = 2*rand.Float32() - 1
+	}
+	ip0, _, _ := ipU(x, y)
+
+	var errors int
+	for b.Loop() {
+		z, _, _ := ipU(x, y)
+		if z != ip0 {
+			errors++
+		}
+	}
+	if errors > 0 {
+		b.Logf("errors = %d\n", errors)
+	}
+}
+
+// BenchmarkIPUnrollMore is simd vector inner product, unrolled 5x vector ops
+func BenchmarkIPUnrollMore(b *testing.B) {
+	x := make([]float32, ipBenchLen)
+	y := make([]float32, ipBenchLen)
+
+	for i := range x {
+		x[i] = 2*rand.Float32() - 1
+		y[i] = 2*rand.Float32() - 1
+	}
+	ip0, _, _ := ipUmore(x, y)
+
+	var errors int
+	for b.Loop() {
+		z, _, _ := ipUmore(x, y)
+		if z != ip0 {
+			errors++
+		}
+	}
+	if errors > 0 {
+		b.Logf("errors = %d\n", errors)
+	}
+}
+
+// BenchmarkIPFMA is simd vector inner product computing using FMA.
+func BenchmarkIPFMA(b *testing.B) {
+	x := make([]float32, ipBenchLen)
+	y := make([]float32, ipBenchLen)
+
+	for i := range x {
+		x[i] = 2*rand.Float32() - 1
+		y[i] = 2*rand.Float32() - 1
+	}
+	ip0, _, _ := ipFMA(x, y)
+
+	var errors int
+	for b.Loop() {
+		z, _, _ := ipFMA(x, y)
+		if z != ip0 {
+			errors++
+		}
+	}
+	if errors > 0 {
+		b.Logf("errors = %d\n", errors)
+	}
+}
+
+// ipNosimd computes inner product with serial
+// addition order of the terms (to make the)
+// check comparison turn out right.
+func ipNosimd(x, y []float32) float32 {
+	var z float32
+	for i, a := range x {
+		z += a * y[i]
+	}
+	return z
+}
+
+// BenchmarkIPnosimd1 is serial, just a vanilla inner product.
+func BenchmarkIPnosimd0(b *testing.B) {
+	x := make([]float32, ipBenchLen)
+	y := make([]float32, ipBenchLen)
+
+	for i := range x {
+		x[i] = 2*rand.Float32() - 1
+		y[i] = 2*rand.Float32() - 1
+	}
+	ip0 := ipNosimd(x, y)
+
+	var errors int
+	for b.Loop() {
+		var z float32
+		for i, a := range x {
+			z += a * y[i]
+		}
+		if z != ip0 {
+			errors++
+		}
+	}
+	if errors > 0 {
+		b.Logf("errors = %d\n", errors)
+	}
+}
+
+// BenchmarkIPnosimd1 is serial, but with a no-op subslice that
+// makes it clear that x and y have the same length.
+func BenchmarkIPnosimd1(b *testing.B) {
+	x := make([]float32, ipBenchLen)
+	y := make([]float32, ipBenchLen)
+
+	for i := range x {
+		x[i] = 2*rand.Float32() - 1
+		y[i] = 2*rand.Float32() - 1
+	}
+	ip0 := ipNosimd(x, y)
+
+	var errors int
+	for b.Loop() {
+		var z float32
+		yy := y[:(len(x))]
+		for i, a := range x {
+			z += a * yy[i]
+		}
+		if z != ip0 {
+			errors++
+		}
+	}
+	if errors > 0 {
+		b.Logf("errors = %d\n", errors)
+	}
+}
+
+// BenchmarkIPnosimdA is serial, rewritten to use arrays instead of slices,
+// so no bounds checking, gosh darn it to heck.
+func BenchmarkIPnosimdA(b *testing.B) {
+	var x, y [ipBenchLen]float32
+
+	for i := range x {
+		x[i] = 2*rand.Float32() - 1
+		y[i] = 2*rand.Float32() - 1
+	}
+	ip0 := ipNosimd(x[:], y[:])
+
+	var errors int
+	for b.Loop() {
+		var z float32
+		for i, a := range x {
+			z += a * y[i]
+		}
+		if z != ip0 {
+			errors++
+		}
+	}
+	if errors > 0 {
+		b.Logf("errors = %d\n", errors)
+	}
+}
+
+var x, y [ipBenchLen]float32
+var ip0 float32
+
+func initIp0() {
+	for i := range x {
+		x[i] = 2*rand.Float32() - 1
+		y[i] = 2*rand.Float32() - 1
+	}
+	ip0 = ipNosimd(x[:], y[:])
+}
+
+// BenchmarkIPnosimdAnotBloop is serial, rewritten to use arrays instead of slices,
+// and using a classic iterated loop to see if b.Loop affects subscript inference,
+// so no bounds checking, gosh darn it to heck, this time, for sure.
+func BenchmarkIPnosimdAnotBloop(b *testing.B) {
+	if ip0 == 0 {
+		initIp0()
+	}
+
+	var errors int
+	for range b.N {
+		var z float32
+		for i, a := range x {
+			z += a * y[i]
+		}
+		if z != ip0 {
+			errors++
+		}
+	}
+	if errors > 0 {
+		b.Logf("errors = %d\n", errors)
+	}
+}
+
 func ip(x, y []float32) (float32, int, bool) {
 	var a simd.Float32s
 	sumWidth := a.Len() * 32
@@ -66,6 +286,114 @@
 	return sum(a), sumWidth, emulated
 }
 
+func ipU(x, y []float32) (float32, int, bool) {
+	const U = 4
+	var a, a0, a1, a2, a3 simd.Float32s
+	sumWidth := a.Len() * 32
+	emulated := simd.Emulated()
+	var i int
+	for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
+		i0 := i
+		i1 := i + a.Len()
+		i2 := i + 2*a.Len()
+		i3 := i + 3*a.Len()
+
+		u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
+		v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
+		a0 = a0.Add(u.Mul(v))
+
+		u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
+		v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
+		a1 = a1.Add(u.Mul(v))
+
+		u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
+		v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
+		a2 = a2.Add(u.Mul(v))
+
+		u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
+		v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
+		a3 = a3.Add(u.Mul(v))
+	}
+	a = a0.Add(a1).Add(a2.Add(a3))
+	for ; i < len(x)-a.Len()+1; i += a.Len() {
+		u := simd.LoadFloat32s(x[i : i+a.Len()])
+		v := simd.LoadFloat32s(y[i : i+a.Len()])
+		a = a.Add(u.Mul(v))
+	}
+	if i < len(x) {
+		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
+			Mul(first(simd.LoadFloat32sPart(y[i:]))))
+	}
+
+	return sum(a), sumWidth, emulated
+}
+
+func ipUmore(x, y []float32) (float32, int, bool) {
+	const U = 5
+	var a, a0, a1, a2, a3, a4 simd.Float32s
+	sumWidth := a.Len() * 32
+	emulated := simd.Emulated()
+	var i int
+	for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
+		i0 := i
+		i1 := i + a.Len()
+		i2 := i + 2*a.Len()
+		i3 := i + 3*a.Len()
+		i4 := i + 3*a.Len()
+
+		u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
+		v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
+		a0 = a0.Add(u.Mul(v))
+
+		u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
+		v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
+		a1 = a1.Add(u.Mul(v))
+
+		u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
+		v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
+		a2 = a2.Add(u.Mul(v))
+
+		u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
+		v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
+		a3 = a3.Add(u.Mul(v))
+
+		u = simd.LoadFloat32s(x[i4 : i4+a.Len()])
+		v = simd.LoadFloat32s(y[i4 : i4+a.Len()])
+		a4 = a4.Add(u.Mul(v))
+	}
+	a = a0.Add(a1).Add(a2.Add(a3)).Add(a4)
+
+	for ; i < len(x)-a.Len()+1; i += a.Len() {
+		u := simd.LoadFloat32s(x[i : i+a.Len()])
+		v := simd.LoadFloat32s(y[i : i+a.Len()])
+		a = a.Add(u.Mul(v))
+	}
+	if i < len(x) {
+		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
+			Mul(first(simd.LoadFloat32sPart(y[i:]))))
+	}
+
+	return sum(a), sumWidth, emulated
+}
+
+func ipFMA(x, y []float32) (float32, int, bool) {
+	var a simd.Float32s
+	sumWidth := a.Len() * 32
+	emulated := simd.Emulated()
+	var i int
+	for i = 0; i < len(x)-a.Len()+1; i += a.Len() {
+		u := simd.LoadFloat32s(x[i : i+a.Len()])
+		v := simd.LoadFloat32s(y[i : i+a.Len()])
+		a = u.MulAdd(v, a)
+	}
+	if i < len(x) {
+		a = first(simd.LoadFloat32sPart(x[i:])).MulAdd(
+			first(simd.LoadFloat32sPart(y[i:])), a)
+	}
+
+	return sum(a), sumWidth, emulated
+}
+
 func ipGoTo(x, y []float32) (float32, int, bool) {
 	var a simd.Float32s
 	sumWidth := a.Len() * 32
diff --git a/src/simd/simd_emulated.go b/src/simd/simd_emulated.go
index a962558..e705c97 100644
--- a/src/simd/simd_emulated.go
+++ b/src/simd/simd_emulated.go
@@ -2537,9 +2537,10 @@
 // Add returns the element-wise sum of x and y.
 func (x Float32s) Add(y Float32s) Float32s {
 	var res Float32s
-	for i := 0; i < 4; i++ {
-		res.set(i, x.get(i)+y.get(i))
-	}
+	res.set(0, x.get(0)+y.get(0))
+	res.set(1, x.get(1)+y.get(1))
+	res.set(2, x.get(2)+y.get(2))
+	res.set(3, x.get(3)+y.get(3))
 	return res
 }
 
@@ -2667,18 +2668,21 @@
 // Mul returns the element-wise product of x and y.
 func (x Float32s) Mul(y Float32s) Float32s {
 	var res Float32s
-	for i := 0; i < 4; i++ {
-		res.set(i, x.get(i)*y.get(i))
-	}
+	res.set(0, x.get(0)*y.get(0))
+	res.set(1, x.get(1)*y.get(1))
+	res.set(2, x.get(2)*y.get(2))
+	res.set(3, x.get(3)*y.get(3))
+
 	return res
 }
 
 // MulAdd returns x * y + z element-wise.
 func (x Float32s) MulAdd(y, z Float32s) Float32s {
 	var res Float32s
-	for i := 0; i < 4; i++ {
-		res.set(i, x.get(i)+y.get(i)*z.get(i))
-	}
+	res.set(0, x.get(0)+y.get(0)*z.get(0))
+	res.set(1, x.get(1)+y.get(1)*z.get(1))
+	res.set(2, x.get(2)+y.get(2)*z.get(2))
+	res.set(3, x.get(3)+y.get(3)*z.get(3))
 	return res
 }

Change information

Files:

M src/simd/internal/bridge/simd_emulated.go
M src/simd/ip_test.go
M src/simd/simd_emulated.go

Change size: L

Delta: 3 files changed, 354 insertions(+), 18 deletions(-)

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

David Chase (Gerrit)

unread,

Jun 5, 2026, 10:14:03 PM (4 days ago) Jun 5

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from David Chase

David Chase uploaded new patchset

David Chase uploaded patch set #2 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result-1 by golang...@luci-project-accounts.iam.gserviceaccount.com

Open in Gerrit

Related details

Attention is currently required from:

David Chase

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

David Chase (Gerrit)

unread,

Jun 9, 2026, 5:23:11 PM (12 hours ago) Jun 9

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from David Chase

David Chase uploaded new patchset

David Chase uploaded patch set #5 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result+1 by golang...@luci-project-accounts.iam.gserviceaccount.com

Open in Gerrit

Related details

Attention is currently required from:

David Chase

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Reply all

Reply to author

Forward

[go] simd: add some inner-product benchmarks

David Chase (Gerrit)

David Chase has uploaded the change for review

Commit message

Change diff

Change information

Related details

David Chase (Gerrit)

David Chase uploaded new patchset

Related details

David Chase (Gerrit)

David Chase uploaded new patchset

Related details