[go] simd: add some inner-product benchmarks

3 views
Skip to first unread message

David Chase (Gerrit)

unread,
Jun 5, 2026, 4:32:18 PM (5 days ago) Jun 5
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

David Chase has uploaded the change for review

Commit message

simd: add some inner-product benchmarks

Some hand-unrolling of important methods
was tried, and shown to improve results.

Change-Id: I09b912a3b68431972eeb91200f77a3dd95327d91

Change diff

diff --git a/src/simd/internal/bridge/simd_emulated.go b/src/simd/internal/bridge/simd_emulated.go
index ba4d1b2..58527f88 100644
--- a/src/simd/internal/bridge/simd_emulated.go
+++ b/src/simd/internal/bridge/simd_emulated.go
@@ -2537,9 +2537,10 @@
// Add returns the element-wise sum of x and y.
func (x Float32s) Add(y Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)+y.get(i))
- }
+ res.set(0, x.get(0)+y.get(0))
+ res.set(1, x.get(1)+y.get(1))
+ res.set(2, x.get(2)+y.get(2))
+ res.set(3, x.get(3)+y.get(3))
return res
}

@@ -2667,18 +2668,21 @@
// Mul returns the element-wise product of x and y.
func (x Float32s) Mul(y Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)*y.get(i))
- }
+ res.set(0, x.get(0)*y.get(0))
+ res.set(1, x.get(1)*y.get(1))
+ res.set(2, x.get(2)*y.get(2))
+ res.set(3, x.get(3)*y.get(3))
+
return res
}

// MulAdd returns x * y + z element-wise.
func (x Float32s) MulAdd(y, z Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)+y.get(i)*z.get(i))
- }
+ res.set(0, x.get(0)*y.get(0)+z.get(0))
+ res.set(1, x.get(1)*y.get(1)+z.get(1))
+ res.set(2, x.get(2)*y.get(2)+z.get(2))
+ res.set(3, x.get(3)*y.get(3)+z.get(3))
return res
}

diff --git a/src/simd/ip_test.go b/src/simd/ip_test.go
index 90aea58..ebd8b4f 100644
--- a/src/simd/ip_test.go
+++ b/src/simd/ip_test.go
@@ -8,6 +8,7 @@

import (
"fmt"
+ "math/rand/v2"
"simd"
"testing"
)
@@ -48,6 +49,225 @@
return t
}

+const ipBenchLen = 300000
+
+// BenchmarkIP is simd vector inner product, vanilla transcription.
+func BenchmarkIP(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0, _, _ := ip(x, y)
+
+ var errors int
+ for b.Loop() {
+ z, _, _ := ip(x, y)
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// BenchmarkIPUnroll is simd vector inner product, unrolled 4x vector ops.
+func BenchmarkIPUnroll(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0, _, _ := ipU(x, y)
+
+ var errors int
+ for b.Loop() {
+ z, _, _ := ipU(x, y)
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// BenchmarkIPUnrollMore is simd vector inner product, unrolled 5x vector ops.
+func BenchmarkIPUnrollMore(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0, _, _ := ipUmore(x, y)
+
+ var errors int
+ for b.Loop() {
+ z, _, _ := ipUmore(x, y)
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// BenchmarkIPFMA is simd vector inner product computed using FMA.
+func BenchmarkIPFMA(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0, _, _ := ipFMA(x, y)
+
+ var errors int
+ for b.Loop() {
+ z, _, _ := ipFMA(x, y)
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// ipNosimd computes inner product with serial
+// addition order of the terms (to make the
+// check comparison turn out right).
+func ipNosimd(x, y []float32) float32 {
+ var z float32
+ for i, a := range x {
+ z += a * y[i]
+ }
+ return z
+}
+
+// BenchmarkIPnosimd0 is serial, just a vanilla inner product.
+func BenchmarkIPnosimd0(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0 := ipNosimd(x, y)
+
+ var errors int
+ for b.Loop() {
+ var z float32
+ for i, a := range x {
+ z += a * y[i]
+ }
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// BenchmarkIPnosimd1 is serial, but with a no-op subslice that
+// makes it clear that x and y have the same length.
+func BenchmarkIPnosimd1(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0 := ipNosimd(x, y)
+
+ var errors int
+ for b.Loop() {
+ var z float32
+ yy := y[:(len(x))]
+ for i, a := range x {
+ z += a * yy[i]
+ }
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// BenchmarkIPnosimdA is serial, rewritten to use arrays instead of slices,
+// so no bounds checking, gosh darn it to heck.
+func BenchmarkIPnosimdA(b *testing.B) {
+ var x, y [ipBenchLen]float32
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0 := ipNosimd(x[:], y[:])
+
+ var errors int
+ for b.Loop() {
+ var z float32
+ for i, a := range x {
+ z += a * y[i]
+ }
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+var x, y [ipBenchLen]float32
+var ip0 float32
+
+func initIp0() {
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0 = ipNosimd(x[:], y[:])
+}
+
+// BenchmarkIPnosimdAnotBloop is serial, rewritten to use arrays instead of slices,
+// and using a classic iterated loop to see if b.Loop affects subscript inference,
+// so no bounds checking, gosh darn it to heck, this time, for sure.
+func BenchmarkIPnosimdAnotBloop(b *testing.B) {
+ if ip0 == 0 {
+ initIp0()
+ }
+
+ var errors int
+ for range b.N {
+ var z float32
+ for i, a := range x {
+ z += a * y[i]
+ }
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
func ip(x, y []float32) (float32, int, bool) {
var a simd.Float32s
sumWidth := a.Len() * 32
@@ -66,6 +286,114 @@
return sum(a), sumWidth, emulated
}

+func ipU(x, y []float32) (float32, int, bool) {
+ const U = 4
+ var a, a0, a1, a2, a3 simd.Float32s
+ sumWidth := a.Len() * 32
+ emulated := simd.Emulated()
+ var i int
+ for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
+ i0 := i
+ i1 := i + a.Len()
+ i2 := i + 2*a.Len()
+ i3 := i + 3*a.Len()
+
+ u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
+ v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
+ a0 = a0.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
+ v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
+ a1 = a1.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
+ v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
+ a2 = a2.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
+ v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
+ a3 = a3.Add(u.Mul(v))
+ }
+ a = a0.Add(a1).Add(a2.Add(a3))
+ for ; i < len(x)-a.Len()+1; i += a.Len() {
+ u := simd.LoadFloat32s(x[i : i+a.Len()])
+ v := simd.LoadFloat32s(y[i : i+a.Len()])
+ a = a.Add(u.Mul(v))
+ }
+ if i < len(x) {
+ a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
+ Mul(first(simd.LoadFloat32sPart(y[i:]))))
+ }
+
+ return sum(a), sumWidth, emulated
+}
+
+func ipUmore(x, y []float32) (float32, int, bool) {
+ const U = 5
+ var a, a0, a1, a2, a3, a4 simd.Float32s
+ sumWidth := a.Len() * 32
+ emulated := simd.Emulated()
+ var i int
+ for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
+ i0 := i
+ i1 := i + a.Len()
+ i2 := i + 2*a.Len()
+ i3 := i + 3*a.Len()
+ i4 := i + 4*a.Len()
+
+ u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
+ v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
+ a0 = a0.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
+ v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
+ a1 = a1.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
+ v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
+ a2 = a2.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
+ v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
+ a3 = a3.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i4 : i4+a.Len()])
+ v = simd.LoadFloat32s(y[i4 : i4+a.Len()])
+ a4 = a4.Add(u.Mul(v))
+ }
+ a = a0.Add(a1).Add(a2.Add(a3)).Add(a4)
+
+ for ; i < len(x)-a.Len()+1; i += a.Len() {
+ u := simd.LoadFloat32s(x[i : i+a.Len()])
+ v := simd.LoadFloat32s(y[i : i+a.Len()])
+ a = a.Add(u.Mul(v))
+ }
+ if i < len(x) {
+ a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
+ Mul(first(simd.LoadFloat32sPart(y[i:]))))
+ }
+
+ return sum(a), sumWidth, emulated
+}
+
+func ipFMA(x, y []float32) (float32, int, bool) {
+ var a simd.Float32s
+ sumWidth := a.Len() * 32
+ emulated := simd.Emulated()
+ var i int
+ for i = 0; i < len(x)-a.Len()+1; i += a.Len() {
+ u := simd.LoadFloat32s(x[i : i+a.Len()])
+ v := simd.LoadFloat32s(y[i : i+a.Len()])
+ a = u.MulAdd(v, a)
+ }
+ if i < len(x) {
+ a = first(simd.LoadFloat32sPart(x[i:])).MulAdd(
+ first(simd.LoadFloat32sPart(y[i:])), a)
+ }
+
+ return sum(a), sumWidth, emulated
+}
+
func ipGoTo(x, y []float32) (float32, int, bool) {
var a simd.Float32s
sumWidth := a.Len() * 32
diff --git a/src/simd/simd_emulated.go b/src/simd/simd_emulated.go
index a962558..e705c97 100644
--- a/src/simd/simd_emulated.go
+++ b/src/simd/simd_emulated.go
@@ -2537,9 +2537,10 @@
// Add returns the element-wise sum of x and y.
func (x Float32s) Add(y Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)+y.get(i))
- }
+ res.set(0, x.get(0)+y.get(0))
+ res.set(1, x.get(1)+y.get(1))
+ res.set(2, x.get(2)+y.get(2))
+ res.set(3, x.get(3)+y.get(3))
return res
}

@@ -2667,18 +2668,21 @@
// Mul returns the element-wise product of x and y.
func (x Float32s) Mul(y Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)*y.get(i))
- }
+ res.set(0, x.get(0)*y.get(0))
+ res.set(1, x.get(1)*y.get(1))
+ res.set(2, x.get(2)*y.get(2))
+ res.set(3, x.get(3)*y.get(3))
+
return res
}

// MulAdd returns x * y + z element-wise.
func (x Float32s) MulAdd(y, z Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)+y.get(i)*z.get(i))
- }
+ res.set(0, x.get(0)*y.get(0)+z.get(0))
+ res.set(1, x.get(1)*y.get(1)+z.get(1))
+ res.set(2, x.get(2)*y.get(2)+z.get(2))
+ res.set(3, x.get(3)*y.get(3)+z.get(3))
return res
}

Change information

Files:
  • M src/simd/internal/bridge/simd_emulated.go
  • M src/simd/ip_test.go
  • M src/simd/simd_emulated.go
Change size: L
Delta: 3 files changed, 354 insertions(+), 18 deletions(-)
Open in Gerrit

Related details

Attention set is empty
Submit Requirements:
  • requirement is not satisfiedCode-Review
  • requirement satisfiedNo-Unresolved-Comments
  • requirement is not satisfiedReview-Enforcement
  • requirement is not satisfiedTryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newchange
Gerrit-Project: go
Gerrit-Branch: master
Gerrit-Change-Id: I09b912a3b68431972eeb91200f77a3dd95327d91
Gerrit-Change-Number: 787561
Gerrit-PatchSet: 1
Gerrit-Owner: David Chase <drc...@google.com>
Gerrit-Reviewer: David Chase <drc...@google.com>
unsatisfied_requirement
satisfied_requirement
open
diffy

David Chase (Gerrit)

unread,
Jun 5, 2026, 10:14:03 PM (4 days ago) Jun 5
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com
Attention needed from David Chase

David Chase uploaded new patchset

David Chase uploaded patch set #2 to this change.
Following approvals got outdated and were removed:
Open in Gerrit

Related details

Attention is currently required from:
  • David Chase
Submit Requirements:
  • requirement is not satisfiedCode-Review
  • requirement satisfiedNo-Unresolved-Comments
  • requirement is not satisfiedReview-Enforcement
  • requirement is not satisfiedTryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newpatchset
Gerrit-Project: go
Gerrit-Branch: master
Gerrit-Change-Id: I09b912a3b68431972eeb91200f77a3dd95327d91
Gerrit-Change-Number: 787561
Gerrit-PatchSet: 2
Gerrit-Owner: David Chase <drc...@google.com>
Gerrit-Reviewer: David Chase <drc...@google.com>
unsatisfied_requirement
satisfied_requirement
open
diffy

David Chase (Gerrit)

unread,
Jun 9, 2026, 5:23:11 PM (12 hours ago) Jun 9
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com
Attention needed from David Chase

David Chase uploaded new patchset

David Chase uploaded patch set #5 to this change.
Following approvals got outdated and were removed:
Open in Gerrit

Related details

Attention is currently required from:
  • David Chase
Submit Requirements:
  • requirement is not satisfiedCode-Review
  • requirement satisfiedNo-Unresolved-Comments
  • requirement is not satisfiedReview-Enforcement
  • requirement is not satisfiedTryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newpatchset
Gerrit-Project: go
Gerrit-Branch: master
Gerrit-Change-Id: I09b912a3b68431972eeb91200f77a3dd95327d91
Gerrit-Change-Number: 787561
Gerrit-PatchSet: 5
unsatisfied_requirement
satisfied_requirement
open
diffy
Reply all
Reply to author
Forward
0 new messages