diff --git a/src/simd/internal/bridge/simd_emulated.go b/src/simd/internal/bridge/simd_emulated.go
index ba4d1b2..58527f88 100644
--- a/src/simd/internal/bridge/simd_emulated.go
+++ b/src/simd/internal/bridge/simd_emulated.go
@@ -2537,9 +2537,10 @@
// Add returns the element-wise sum of x and y.
func (x Float32s) Add(y Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)+y.get(i))
- }
+ res.set(0, x.get(0)+y.get(0))
+ res.set(1, x.get(1)+y.get(1))
+ res.set(2, x.get(2)+y.get(2))
+ res.set(3, x.get(3)+y.get(3))
return res
}
@@ -2667,18 +2668,21 @@
// Mul returns the element-wise product of x and y.
func (x Float32s) Mul(y Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)*y.get(i))
- }
+ res.set(0, x.get(0)*y.get(0))
+ res.set(1, x.get(1)*y.get(1))
+ res.set(2, x.get(2)*y.get(2))
+ res.set(3, x.get(3)*y.get(3))
+
return res
}
// MulAdd returns x * y + z element-wise.
func (x Float32s) MulAdd(y, z Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)+y.get(i)*z.get(i))
- }
+ res.set(0, x.get(0)+y.get(0)*z.get(0))
+ res.set(1, x.get(1)+y.get(1)*z.get(1))
+ res.set(2, x.get(2)+y.get(2)*z.get(2))
+ res.set(3, x.get(3)+y.get(3)*z.get(3))
return res
}
diff --git a/src/simd/ip_test.go b/src/simd/ip_test.go
index 90aea58..ebd8b4f 100644
--- a/src/simd/ip_test.go
+++ b/src/simd/ip_test.go
@@ -8,6 +8,7 @@
import (
"fmt"
+ "math/rand/v2"
"simd"
"testing"
)
@@ -48,6 +49,225 @@
return t
}
+const ipBenchLen = 300000
+
+// BenchmarkIP is simd vector inner product, vanilla transcription.
+func BenchmarkIP(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0, _, _ := ip(x, y)
+
+ var errors int
+ for b.Loop() {
+ z, _, _ := ip(x, y)
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// BenchmarkIPUnroll is simd vector inner product, unrolled 4x vector ops.
+func BenchmarkIPUnroll(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0, _, _ := ipU(x, y)
+
+ var errors int
+ for b.Loop() {
+ z, _, _ := ipU(x, y)
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// BenchmarkIPUnrollMore is simd vector inner product, unrolled 5x vector ops
+func BenchmarkIPUnrollMore(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0, _, _ := ipUmore(x, y)
+
+ var errors int
+ for b.Loop() {
+ z, _, _ := ipUmore(x, y)
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// BenchmarkIPFMA is simd vector inner product computing using FMA.
+func BenchmarkIPFMA(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0, _, _ := ipFMA(x, y)
+
+ var errors int
+ for b.Loop() {
+ z, _, _ := ipFMA(x, y)
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// ipNosimd computes inner product with serial
+// addition order of the terms (to make the)
+// check comparison turn out right.
+func ipNosimd(x, y []float32) float32 {
+ var z float32
+ for i, a := range x {
+ z += a * y[i]
+ }
+ return z
+}
+
+// BenchmarkIPnosimd1 is serial, just a vanilla inner product.
+func BenchmarkIPnosimd0(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0 := ipNosimd(x, y)
+
+ var errors int
+ for b.Loop() {
+ var z float32
+ for i, a := range x {
+ z += a * y[i]
+ }
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// BenchmarkIPnosimd1 is serial, but with a no-op subslice that
+// makes it clear that x and y have the same length.
+func BenchmarkIPnosimd1(b *testing.B) {
+ x := make([]float32, ipBenchLen)
+ y := make([]float32, ipBenchLen)
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0 := ipNosimd(x, y)
+
+ var errors int
+ for b.Loop() {
+ var z float32
+ yy := y[:(len(x))]
+ for i, a := range x {
+ z += a * yy[i]
+ }
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+// BenchmarkIPnosimdA is serial, rewritten to use arrays instead of slices,
+// so no bounds checking, gosh darn it to heck.
+func BenchmarkIPnosimdA(b *testing.B) {
+ var x, y [ipBenchLen]float32
+
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0 := ipNosimd(x[:], y[:])
+
+ var errors int
+ for b.Loop() {
+ var z float32
+ for i, a := range x {
+ z += a * y[i]
+ }
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
+var x, y [ipBenchLen]float32
+var ip0 float32
+
+func initIp0() {
+ for i := range x {
+ x[i] = 2*rand.Float32() - 1
+ y[i] = 2*rand.Float32() - 1
+ }
+ ip0 = ipNosimd(x[:], y[:])
+}
+
+// BenchmarkIPnosimdAnotBloop is serial, rewritten to use arrays instead of slices,
+// and using a classic iterated loop to see if b.Loop affects subscript inference,
+// so no bounds checking, gosh darn it to heck, this time, for sure.
+func BenchmarkIPnosimdAnotBloop(b *testing.B) {
+ if ip0 == 0 {
+ initIp0()
+ }
+
+ var errors int
+ for range b.N {
+ var z float32
+ for i, a := range x {
+ z += a * y[i]
+ }
+ if z != ip0 {
+ errors++
+ }
+ }
+ if errors > 0 {
+ b.Logf("errors = %d\n", errors)
+ }
+}
+
func ip(x, y []float32) (float32, int, bool) {
var a simd.Float32s
sumWidth := a.Len() * 32
@@ -66,6 +286,114 @@
return sum(a), sumWidth, emulated
}
+func ipU(x, y []float32) (float32, int, bool) {
+ const U = 4
+ var a, a0, a1, a2, a3 simd.Float32s
+ sumWidth := a.Len() * 32
+ emulated := simd.Emulated()
+ var i int
+ for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
+ i0 := i
+ i1 := i + a.Len()
+ i2 := i + 2*a.Len()
+ i3 := i + 3*a.Len()
+
+ u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
+ v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
+ a0 = a0.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
+ v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
+ a1 = a1.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
+ v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
+ a2 = a2.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
+ v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
+ a3 = a3.Add(u.Mul(v))
+ }
+ a = a0.Add(a1).Add(a2.Add(a3))
+ for ; i < len(x)-a.Len()+1; i += a.Len() {
+ u := simd.LoadFloat32s(x[i : i+a.Len()])
+ v := simd.LoadFloat32s(y[i : i+a.Len()])
+ a = a.Add(u.Mul(v))
+ }
+ if i < len(x) {
+ a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
+ Mul(first(simd.LoadFloat32sPart(y[i:]))))
+ }
+
+ return sum(a), sumWidth, emulated
+}
+
+func ipUmore(x, y []float32) (float32, int, bool) {
+ const U = 5
+ var a, a0, a1, a2, a3, a4 simd.Float32s
+ sumWidth := a.Len() * 32
+ emulated := simd.Emulated()
+ var i int
+ for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
+ i0 := i
+ i1 := i + a.Len()
+ i2 := i + 2*a.Len()
+ i3 := i + 3*a.Len()
+ i4 := i + 3*a.Len()
+
+ u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
+ v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
+ a0 = a0.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
+ v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
+ a1 = a1.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
+ v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
+ a2 = a2.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
+ v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
+ a3 = a3.Add(u.Mul(v))
+
+ u = simd.LoadFloat32s(x[i4 : i4+a.Len()])
+ v = simd.LoadFloat32s(y[i4 : i4+a.Len()])
+ a4 = a4.Add(u.Mul(v))
+ }
+ a = a0.Add(a1).Add(a2.Add(a3)).Add(a4)
+
+ for ; i < len(x)-a.Len()+1; i += a.Len() {
+ u := simd.LoadFloat32s(x[i : i+a.Len()])
+ v := simd.LoadFloat32s(y[i : i+a.Len()])
+ a = a.Add(u.Mul(v))
+ }
+ if i < len(x) {
+ a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
+ Mul(first(simd.LoadFloat32sPart(y[i:]))))
+ }
+
+ return sum(a), sumWidth, emulated
+}
+
+func ipFMA(x, y []float32) (float32, int, bool) {
+ var a simd.Float32s
+ sumWidth := a.Len() * 32
+ emulated := simd.Emulated()
+ var i int
+ for i = 0; i < len(x)-a.Len()+1; i += a.Len() {
+ u := simd.LoadFloat32s(x[i : i+a.Len()])
+ v := simd.LoadFloat32s(y[i : i+a.Len()])
+ a = u.MulAdd(v, a)
+ }
+ if i < len(x) {
+ a = first(simd.LoadFloat32sPart(x[i:])).MulAdd(
+ first(simd.LoadFloat32sPart(y[i:])), a)
+ }
+
+ return sum(a), sumWidth, emulated
+}
+
func ipGoTo(x, y []float32) (float32, int, bool) {
var a simd.Float32s
sumWidth := a.Len() * 32
diff --git a/src/simd/simd_emulated.go b/src/simd/simd_emulated.go
index a962558..e705c97 100644
--- a/src/simd/simd_emulated.go
+++ b/src/simd/simd_emulated.go
@@ -2537,9 +2537,10 @@
// Add returns the element-wise sum of x and y.
func (x Float32s) Add(y Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)+y.get(i))
- }
+ res.set(0, x.get(0)+y.get(0))
+ res.set(1, x.get(1)+y.get(1))
+ res.set(2, x.get(2)+y.get(2))
+ res.set(3, x.get(3)+y.get(3))
return res
}
@@ -2667,18 +2668,21 @@
// Mul returns the element-wise product of x and y.
func (x Float32s) Mul(y Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)*y.get(i))
- }
+ res.set(0, x.get(0)*y.get(0))
+ res.set(1, x.get(1)*y.get(1))
+ res.set(2, x.get(2)*y.get(2))
+ res.set(3, x.get(3)*y.get(3))
+
return res
}
// MulAdd returns x * y + z element-wise.
func (x Float32s) MulAdd(y, z Float32s) Float32s {
var res Float32s
- for i := 0; i < 4; i++ {
- res.set(i, x.get(i)+y.get(i)*z.get(i))
- }
+ res.set(0, x.get(0)+y.get(0)*z.get(0))
+ res.set(1, x.get(1)+y.get(1)*z.get(1))
+ res.set(2, x.get(2)+y.get(2)*z.get(2))
+ res.set(3, x.get(3)+y.get(3)*z.get(3))
return res
}