[go/dev.simd] [dev.simd] simd: add arm64 ADDV (SumAcross) example

1 view

Skip to first unread message

Alexander Musman (Gerrit)

unread,

Nov 6, 2025, 6:09:31 AM (5 days ago) Nov 6

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Alexander Musman has uploaded the change for review

Commit message

[dev.simd] simd: add arm64 ADDV (SumAcross) example

Change-Id: If1d3f76a2619099c4c0bed3d6c30cea716143ce8

Change diff

diff --git a/src/cmd/compile/internal/arm64/simdssa.go b/src/cmd/compile/internal/arm64/simdssa.go
index d0578bc..5366780 100644
--- a/src/cmd/compile/internal/arm64/simdssa.go
+++ b/src/cmd/compile/internal/arm64/simdssa.go
@@ -72,6 +72,15 @@
 	case ssa.OpARM64VDUPS:
 		p = simdV11ImmIn1(s, v, arm64.ARNG_S)
 
+	case ssa.OpARM64VADDV16B:
+		p = simdV11Scalar(s, v, arm64.ARNG_16B)
+
+	case ssa.OpARM64VADDV4S:
+		p = simdV11Scalar(s, v, arm64.ARNG_4S)
+
+	case ssa.OpARM64VADDV8H:
+		p = simdV11Scalar(s, v, arm64.ARNG_8H)
+
 	case ssa.OpARM64VEXT16B:
 		p = simdV21Imm8(s, v, arm64.ARNG_16B)
 
diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index db3c356..1f7c94d 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -1779,6 +1779,16 @@
 	return p
 }
 
+// simdV11Scalar generates a vector-to-scalar reduction: op Vn.<T>, Vd. For example: VADDV V1.B16, V0
+func simdV11Scalar(s *ssagen.State, v *ssa.Value, arrangement int16) *obj.Prog {
+	p := s.Prog(v.Op.Asm())
+	p.From.Type = obj.TYPE_REG
+	p.From.Reg, p.From.Class = simdReg(v.Args[0], arrangement)
+	p.To.Type = obj.TYPE_REG
+	p.To.Reg = v.Reg() - arm64.REG_F0 + arm64.REG_V0
+	return p
+}
+
 // simdV11ImmIn1 generates a SIMD instruction with indexed input and
 // vector output: op Vn[imm], Vd
 // For example: VDUP V1.S[0], V0.4S
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
index dc345f9..de0caf5 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
@@ -84,6 +84,12 @@
 (SubUint16x8 ...) => (VSUB8H ...)
 (SubUint32x4 ...) => (VSUB4S ...)
 (SubUint64x2 ...) => (VSUB2D ...)
+(SumAcrossInt8x16 ...) => (VADDV16B ...)
+(SumAcrossInt16x8 ...) => (VADDV8H ...)
+(SumAcrossInt32x4 ...) => (VADDV4S ...)
+(SumAcrossUint8x16 ...) => (VADDV16B ...)
+(SumAcrossUint16x8 ...) => (VADDV8H ...)
+(SumAcrossUint32x4 ...) => (VADDV4S ...)
 (TestInt8x16 ...) => (VCMTST16B ...)
 (TestInt16x8 ...) => (VCMTST8H ...)
 (TestInt32x4 ...) => (VCMTST4S ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
index d801167..9e8aebf 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
@@ -12,6 +12,9 @@
 		{name: "VADDP4S", argLength: 2, reg: v21, asm: "VADDP", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VADDP8H", argLength: 2, reg: v21, asm: "VADDP", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VADDP16B", argLength: 2, reg: v21, asm: "VADDP", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VADDV4S", argLength: 1, reg: v11, asm: "VADDV", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VADDV8H", argLength: 1, reg: v11, asm: "VADDV", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VADDV16B", argLength: 1, reg: v11, asm: "VADDV", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VAND16B", argLength: 2, reg: v21, asm: "VAND", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VBIT16B", argLength: 3, reg: v31, asm: "VBIT", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VCMEQ2D", argLength: 2, reg: v21, asm: "VCMEQ", commutative: true, typ: "Vec128", resultInArg0: false},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 1d94465..47ac503 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -1019,6 +1019,12 @@
 		{name: "SumAbsDiffUint8x16", argLength: 2, commutative: false},
 		{name: "SumAbsDiffUint8x32", argLength: 2, commutative: false},
 		{name: "SumAbsDiffUint8x64", argLength: 2, commutative: false},
+		{name: "SumAcrossInt8x16", argLength: 1, commutative: false},
+		{name: "SumAcrossInt16x8", argLength: 1, commutative: false},
+		{name: "SumAcrossInt32x4", argLength: 1, commutative: false},
+		{name: "SumAcrossUint8x16", argLength: 1, commutative: false},
+		{name: "SumAcrossUint16x8", argLength: 1, commutative: false},
+		{name: "SumAcrossUint32x4", argLength: 1, commutative: false},
 		{name: "TestInt8x16", argLength: 2, commutative: true},
 		{name: "TestInt16x8", argLength: 2, commutative: true},
 		{name: "TestInt32x4", argLength: 2, commutative: true},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 3d81779..34e6dfb 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -3117,6 +3117,9 @@
 	OpARM64VADDP4S
 	OpARM64VADDP8H
 	OpARM64VADDP16B
+	OpARM64VADDV4S
+	OpARM64VADDV8H
+	OpARM64VADDV16B
 	OpARM64VAND16B
 	OpARM64VBIT16B
 	OpARM64VCMEQ2D
@@ -5869,6 +5872,12 @@
 	OpSumAbsDiffUint8x16
 	OpSumAbsDiffUint8x32
 	OpSumAbsDiffUint8x64
+	OpSumAcrossInt8x16
+	OpSumAcrossInt16x8
+	OpSumAcrossInt32x4
+	OpSumAcrossUint8x16
+	OpSumAcrossUint16x8
+	OpSumAcrossUint32x4
 	OpTestInt8x16
 	OpTestInt16x8
 	OpTestInt32x4
@@ -46374,6 +46383,45 @@
 		},
 	},
 	{
+		name:   "VADDV4S",
+		argLen: 1,
+		asm:    arm64.AVADDV,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+			outputs: []outputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+		},
+	},
+	{
+		name:   "VADDV8H",
+		argLen: 1,
+		asm:    arm64.AVADDV,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+			outputs: []outputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+		},
+	},
+	{
+		name:   "VADDV16B",
+		argLen: 1,
+		asm:    arm64.AVADDV,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+			outputs: []outputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+		},
+	},
+	{
 		name:        "VAND16B",
 		argLen:      2,
 		commutative: true,
@@ -71691,6 +71739,36 @@
 		generic: true,
 	},
 	{
+		name:    "SumAcrossInt8x16",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "SumAcrossInt16x8",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "SumAcrossInt32x4",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "SumAcrossUint8x16",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "SumAcrossUint16x8",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "SumAcrossUint32x4",
+		argLen:  1,
+		generic: true,
+	},
+	{
 		name:        "TestInt8x16",
 		argLen:      2,
 		commutative: true,
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index c811576..92e921d 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -1350,6 +1350,24 @@
 	case OpSubUint8x16:
 		v.Op = OpARM64VSUB16B
 		return true
+	case OpSumAcrossInt16x8:
+		v.Op = OpARM64VADDV8H
+		return true
+	case OpSumAcrossInt32x4:
+		v.Op = OpARM64VADDV4S
+		return true
+	case OpSumAcrossInt8x16:
+		v.Op = OpARM64VADDV16B
+		return true
+	case OpSumAcrossUint16x8:
+		v.Op = OpARM64VADDV8H
+		return true
+	case OpSumAcrossUint32x4:
+		v.Op = OpARM64VADDV4S
+		return true
+	case OpSumAcrossUint8x16:
+		v.Op = OpARM64VADDV16B
+		return true
 	case OpTailCall:
 		v.Op = OpARM64CALLtail
 		return true
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index 5d7c6ed..5ee7766 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -1780,6 +1780,12 @@
 	addF(simdPackage, "Uint16x8.Sub", opLen2(ssa.OpSubUint16x8, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Uint32x4.Sub", opLen2(ssa.OpSubUint32x4, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Uint64x2.Sub", opLen2(ssa.OpSubUint64x2, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Int8x16.SumAcross", opLen1(ssa.OpSumAcrossInt8x16, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Int16x8.SumAcross", opLen1(ssa.OpSumAcrossInt16x8, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Int32x4.SumAcross", opLen1(ssa.OpSumAcrossInt32x4, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Uint8x16.SumAcross", opLen1(ssa.OpSumAcrossUint8x16, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Uint16x8.SumAcross", opLen1(ssa.OpSumAcrossUint16x8, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Uint32x4.SumAcross", opLen1(ssa.OpSumAcrossUint32x4, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Int8x16.Test", opLen2(ssa.OpTestInt8x16, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Int16x8.Test", opLen2(ssa.OpTestInt16x8, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Int32x4.Test", opLen2(ssa.OpTestInt32x4, types.TypeVec128), sys.ARM64)
diff --git a/src/simd/_gen/simdgen/arm64/ops/AddSub/go.yaml b/src/simd/_gen/simdgen/arm64/ops/AddSub/go.yaml
index 3a6ecbb..52ba227 100644
--- a/src/simd/_gen/simdgen/arm64/ops/AddSub/go.yaml
+++ b/src/simd/_gen/simdgen/arm64/ops/AddSub/go.yaml
@@ -21,11 +21,12 @@
   in: *2any
   out: *1any
 
-# TODO
-#- go: SumAcross
-#  asm: "VADDV"
-#  in:
-#  - &any
-#    go: $t
-#  out:
-#  - *any
+- go: SumAcross
+  asm: "VADDV"
+  in:
+  - &any
+    go: $t
+  out:
+  - &any
+    go: $t
+    treatLikeAScalarOfSize: 0
diff --git a/src/simd/_gen/simdgen/gen_simdMachineOps.go b/src/simd/_gen/simdgen/gen_simdMachineOps.go
index 66350d4..519d4da 100644
--- a/src/simd/_gen/simdgen/gen_simdMachineOps.go
+++ b/src/simd/_gen/simdgen/gen_simdMachineOps.go
@@ -113,7 +113,7 @@
 			panic(fmt.Errorf("unsupported register constraint, please update the template and AMD64Ops.go: %s.  Op is %s", regInfo, op))
 		}
 		var outType string
-		if shapeOut == OneVregOut || shapeOut == OneVregOutAtIn || gOp.Out[0].OverwriteClass != nil {
+		if shapeOut == OneVregOut || shapeOut == OneVregOutAtIn || shapeOut == OneVregOutScalar || gOp.Out[0].OverwriteClass != nil {
 			// If class overwrite is happening, that's not really a mask but a vreg.
 			outType = fmt.Sprintf("Vec%d", *gOp.Out[0].Bits)
 		} else if shapeOut == OneGregOut {
diff --git a/src/simd/_gen/simdgen/gen_simdrules.go b/src/simd/_gen/simdgen/gen_simdrules.go
index e824fe7..311436e 100644
--- a/src/simd/_gen/simdgen/gen_simdrules.go
+++ b/src/simd/_gen/simdgen/gen_simdrules.go
@@ -167,7 +167,7 @@
 		}
 		var tplName string
 		// If class overwrite is happening, that's not really a mask but a vreg.
-		if opOutShape == OneVregOut || opOutShape == OneVregOutAtIn || gOp.Out[0].OverwriteClass != nil {
+		if opOutShape == OneVregOut || opOutShape == OneVregOutAtIn || opOutShape == OneVregOutScalar || gOp.Out[0].OverwriteClass != nil {
 			switch opInShape {
 			case OneImmIn:
 				tplName = "pureVreg"
diff --git a/src/simd/_gen/simdgen/gen_simdssa.go b/src/simd/_gen/simdgen/gen_simdssa.go
index ed92706..6e10c18 100644
--- a/src/simd/_gen/simdgen/gen_simdssa.go
+++ b/src/simd/_gen/simdgen/gen_simdssa.go
@@ -105,6 +105,7 @@
 		"v3kv",
 		"v11Imm8",
 		"v11ImmIn1",
+		"v11Scalar",
 		"vkvImm8",
 		"v21Imm8",
 		"v21List",
@@ -149,6 +150,8 @@
 		}
 		if shapeOut == OneVregOutAtIn {
 			regShape += "ResultInArg0"
+		} else if shapeOut == OneVregOutScalar {
+			regShape += "Scalar"
 		}
 		if shapeIn == OneImmIn || shapeIn == OneKmaskImmIn {
 			if immOpArg != "" {
diff --git a/src/simd/_gen/simdgen/gen_utility.go b/src/simd/_gen/simdgen/gen_utility.go
index 1e5f163c7..a17d3ac 100644
--- a/src/simd/_gen/simdgen/gen_utility.go
+++ b/src/simd/_gen/simdgen/gen_utility.go
@@ -91,12 +91,13 @@
 )
 
 const (
-	InvalidOut     outShape = iota
-	NoOut                   // no output
-	OneVregOut              // (one) vector register output
-	OneGregOut              // (one) general register output
-	OneKmaskOut             // mask output
-	OneVregOutAtIn          // the first input is also the output
+	InvalidOut       outShape = iota
+	NoOut                     // no output
+	OneVregOut                // (one) vector register output
+	OneGregOut                // (one) general register output
+	OneKmaskOut               // mask output
+	OneVregOutAtIn            // the first input is also the output
+	OneVregOutScalar          // the vector register output gets scalar result in lane 0 and other lanes are zeroed
 )
 
 const (
@@ -130,6 +131,9 @@
 		outputReg = op.Out[0].AsmPos
 		if op.Out[0].Class == "vreg" {
 			shapeOut = OneVregOut
+			if op.Out[0].TreatLikeAScalarOfSize != nil {
+				shapeOut = OneVregOutScalar
+			}
 		} else if op.Out[0].Class == "greg" {
 			shapeOut = OneGregOut
 		} else if op.Out[0].Class == "mask" {
diff --git a/src/simd/ops_arm64.go b/src/simd/ops_arm64.go
index fd0828e..c19e4d6 100644
--- a/src/simd/ops_arm64.go
+++ b/src/simd/ops_arm64.go
@@ -534,6 +534,44 @@
 // Asm: VSUB, CPU Feature: NEON
 func (x Uint64x2) Sub(y Uint64x2) Uint64x2
 
+/* SumAcross */
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Int8x16) SumAcross() Int8x16
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Int16x8) SumAcross() Int16x8
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Int32x4) SumAcross() Int32x4
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Uint8x16) SumAcross() Uint8x16
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Uint16x8) SumAcross() Uint16x8
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Uint32x4) SumAcross() Uint32x4
+
 /* Test */
 
 // Test tests bitwise AND of corresponding elements is non-zero.

Change information

Files:

M src/cmd/compile/internal/arm64/simdssa.go
M src/cmd/compile/internal/arm64/ssa.go
M src/cmd/compile/internal/ssa/_gen/simdARM64.rules
M src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
M src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
M src/cmd/compile/internal/ssa/opGen.go
M src/cmd/compile/internal/ssa/rewriteARM64.go
M src/cmd/compile/internal/ssagen/simdintrinsics.go
M src/simd/_gen/simdgen/arm64/ops/AddSub/go.yaml
M src/simd/_gen/simdgen/gen_simdMachineOps.go
M src/simd/_gen/simdgen/gen_simdrules.go
M src/simd/_gen/simdgen/gen_simdssa.go
M src/simd/_gen/simdgen/gen_utility.go
M src/simd/ops_arm64.go

Change size: M

Delta: 14 files changed, 198 insertions(+), 16 deletions(-)

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Alexander Musman (Gerrit)

unread,

Nov 6, 2025, 6:45:45 AM (5 days ago) Nov 6

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Alexander Musman uploaded new patchset

Alexander Musman uploaded patch set #2 to this change.