[go/dev.simd] [dev.simd] simd: add arm64 ADDV (SumAcross) example

1 view
Skip to first unread message

Alexander Musman (Gerrit)

unread,
Nov 6, 2025, 6:09:31 AM (5 days ago) Nov 6
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Alexander Musman has uploaded the change for review

Commit message

[dev.simd] simd: add arm64 ADDV (SumAcross) example
Change-Id: If1d3f76a2619099c4c0bed3d6c30cea716143ce8

Change diff

diff --git a/src/cmd/compile/internal/arm64/simdssa.go b/src/cmd/compile/internal/arm64/simdssa.go
index d0578bc..5366780 100644
--- a/src/cmd/compile/internal/arm64/simdssa.go
+++ b/src/cmd/compile/internal/arm64/simdssa.go
@@ -72,6 +72,15 @@
case ssa.OpARM64VDUPS:
p = simdV11ImmIn1(s, v, arm64.ARNG_S)

+ case ssa.OpARM64VADDV16B:
+ p = simdV11Scalar(s, v, arm64.ARNG_16B)
+
+ case ssa.OpARM64VADDV4S:
+ p = simdV11Scalar(s, v, arm64.ARNG_4S)
+
+ case ssa.OpARM64VADDV8H:
+ p = simdV11Scalar(s, v, arm64.ARNG_8H)
+
case ssa.OpARM64VEXT16B:
p = simdV21Imm8(s, v, arm64.ARNG_16B)

diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index db3c356..1f7c94d 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -1779,6 +1779,16 @@
return p
}

+// simdV11Scalar generates vector-to-scalar reduction operations, e.g. VADDV V1.B16, V0
+func simdV11Scalar(s *ssagen.State, v *ssa.Value, arrangement int16) *obj.Prog {
+ p := s.Prog(v.Op.Asm())
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg, p.From.Class = simdReg(v.Args[0], arrangement)
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = v.Reg() - arm64.REG_F0 + arm64.REG_V0
+ return p
+}
+
// simdV11ImmIn1 generates a SIMD instruction with indexed input and
// vector output: op Vn[imm], Vd
// For example: VDUP V1.S[0], V0.4S
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
index dc345f9..de0caf5 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
@@ -84,6 +84,12 @@
(SubUint16x8 ...) => (VSUB8H ...)
(SubUint32x4 ...) => (VSUB4S ...)
(SubUint64x2 ...) => (VSUB2D ...)
+(SumAcrossInt8x16 ...) => (VADDV16B ...)
+(SumAcrossInt16x8 ...) => (VADDV8H ...)
+(SumAcrossInt32x4 ...) => (VADDV4S ...)
+(SumAcrossUint8x16 ...) => (VADDV16B ...)
+(SumAcrossUint16x8 ...) => (VADDV8H ...)
+(SumAcrossUint32x4 ...) => (VADDV4S ...)
(TestInt8x16 ...) => (VCMTST16B ...)
(TestInt16x8 ...) => (VCMTST8H ...)
(TestInt32x4 ...) => (VCMTST4S ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
index d801167..9e8aebf 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
@@ -12,6 +12,9 @@
{name: "VADDP4S", argLength: 2, reg: v21, asm: "VADDP", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VADDP8H", argLength: 2, reg: v21, asm: "VADDP", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VADDP16B", argLength: 2, reg: v21, asm: "VADDP", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VADDV4S", argLength: 1, reg: v11, asm: "VADDV", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VADDV8H", argLength: 1, reg: v11, asm: "VADDV", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VADDV16B", argLength: 1, reg: v11, asm: "VADDV", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VAND16B", argLength: 2, reg: v21, asm: "VAND", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VBIT16B", argLength: 3, reg: v31, asm: "VBIT", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VCMEQ2D", argLength: 2, reg: v21, asm: "VCMEQ", commutative: true, typ: "Vec128", resultInArg0: false},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 1d94465..47ac503 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -1019,6 +1019,12 @@
{name: "SumAbsDiffUint8x16", argLength: 2, commutative: false},
{name: "SumAbsDiffUint8x32", argLength: 2, commutative: false},
{name: "SumAbsDiffUint8x64", argLength: 2, commutative: false},
+ {name: "SumAcrossInt8x16", argLength: 1, commutative: false},
+ {name: "SumAcrossInt16x8", argLength: 1, commutative: false},
+ {name: "SumAcrossInt32x4", argLength: 1, commutative: false},
+ {name: "SumAcrossUint8x16", argLength: 1, commutative: false},
+ {name: "SumAcrossUint16x8", argLength: 1, commutative: false},
+ {name: "SumAcrossUint32x4", argLength: 1, commutative: false},
{name: "TestInt8x16", argLength: 2, commutative: true},
{name: "TestInt16x8", argLength: 2, commutative: true},
{name: "TestInt32x4", argLength: 2, commutative: true},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 3d81779..34e6dfb 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -3117,6 +3117,9 @@
OpARM64VADDP4S
OpARM64VADDP8H
OpARM64VADDP16B
+ OpARM64VADDV4S
+ OpARM64VADDV8H
+ OpARM64VADDV16B
OpARM64VAND16B
OpARM64VBIT16B
OpARM64VCMEQ2D
@@ -5869,6 +5872,12 @@
OpSumAbsDiffUint8x16
OpSumAbsDiffUint8x32
OpSumAbsDiffUint8x64
+ OpSumAcrossInt8x16
+ OpSumAcrossInt16x8
+ OpSumAcrossInt32x4
+ OpSumAcrossUint8x16
+ OpSumAcrossUint16x8
+ OpSumAcrossUint32x4
OpTestInt8x16
OpTestInt16x8
OpTestInt32x4
@@ -46374,6 +46383,45 @@
},
},
{
+ name: "VADDV4S",
+ argLen: 1,
+ asm: arm64.AVADDV,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
+ name: "VADDV8H",
+ argLen: 1,
+ asm: arm64.AVADDV,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
+ name: "VADDV16B",
+ argLen: 1,
+ asm: arm64.AVADDV,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
name: "VAND16B",
argLen: 2,
commutative: true,
@@ -71691,6 +71739,36 @@
generic: true,
},
{
+ name: "SumAcrossInt8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "SumAcrossInt16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "SumAcrossInt32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "SumAcrossUint8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "SumAcrossUint16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "SumAcrossUint32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
name: "TestInt8x16",
argLen: 2,
commutative: true,
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index c811576..92e921d 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -1350,6 +1350,24 @@
case OpSubUint8x16:
v.Op = OpARM64VSUB16B
return true
+ case OpSumAcrossInt16x8:
+ v.Op = OpARM64VADDV8H
+ return true
+ case OpSumAcrossInt32x4:
+ v.Op = OpARM64VADDV4S
+ return true
+ case OpSumAcrossInt8x16:
+ v.Op = OpARM64VADDV16B
+ return true
+ case OpSumAcrossUint16x8:
+ v.Op = OpARM64VADDV8H
+ return true
+ case OpSumAcrossUint32x4:
+ v.Op = OpARM64VADDV4S
+ return true
+ case OpSumAcrossUint8x16:
+ v.Op = OpARM64VADDV16B
+ return true
case OpTailCall:
v.Op = OpARM64CALLtail
return true
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index 5d7c6ed..5ee7766 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -1780,6 +1780,12 @@
addF(simdPackage, "Uint16x8.Sub", opLen2(ssa.OpSubUint16x8, types.TypeVec128), sys.ARM64)
addF(simdPackage, "Uint32x4.Sub", opLen2(ssa.OpSubUint32x4, types.TypeVec128), sys.ARM64)
addF(simdPackage, "Uint64x2.Sub", opLen2(ssa.OpSubUint64x2, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Int8x16.SumAcross", opLen1(ssa.OpSumAcrossInt8x16, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Int16x8.SumAcross", opLen1(ssa.OpSumAcrossInt16x8, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Int32x4.SumAcross", opLen1(ssa.OpSumAcrossInt32x4, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Uint8x16.SumAcross", opLen1(ssa.OpSumAcrossUint8x16, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Uint16x8.SumAcross", opLen1(ssa.OpSumAcrossUint16x8, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Uint32x4.SumAcross", opLen1(ssa.OpSumAcrossUint32x4, types.TypeVec128), sys.ARM64)
addF(simdPackage, "Int8x16.Test", opLen2(ssa.OpTestInt8x16, types.TypeVec128), sys.ARM64)
addF(simdPackage, "Int16x8.Test", opLen2(ssa.OpTestInt16x8, types.TypeVec128), sys.ARM64)
addF(simdPackage, "Int32x4.Test", opLen2(ssa.OpTestInt32x4, types.TypeVec128), sys.ARM64)
diff --git a/src/simd/_gen/simdgen/arm64/ops/AddSub/go.yaml b/src/simd/_gen/simdgen/arm64/ops/AddSub/go.yaml
index 3a6ecbb..52ba227 100644
--- a/src/simd/_gen/simdgen/arm64/ops/AddSub/go.yaml
+++ b/src/simd/_gen/simdgen/arm64/ops/AddSub/go.yaml
@@ -21,11 +21,12 @@
in: *2any
out: *1any

-# TODO
-#- go: SumAcross
-# asm: "VADDV"
-# in:
-# - &any
-# go: $t
-# out:
-# - *any
+- go: SumAcross
+ asm: "VADDV"
+ in:
+ - &any
+ go: $t
+ out:
+ - &any
+ go: $t
+ treatLikeAScalarOfSize: 0
diff --git a/src/simd/_gen/simdgen/gen_simdMachineOps.go b/src/simd/_gen/simdgen/gen_simdMachineOps.go
index 66350d4..519d4da 100644
--- a/src/simd/_gen/simdgen/gen_simdMachineOps.go
+++ b/src/simd/_gen/simdgen/gen_simdMachineOps.go
@@ -113,7 +113,7 @@
panic(fmt.Errorf("unsupported register constraint, please update the template and AMD64Ops.go: %s. Op is %s", regInfo, op))
}
var outType string
- if shapeOut == OneVregOut || shapeOut == OneVregOutAtIn || gOp.Out[0].OverwriteClass != nil {
+ if shapeOut == OneVregOut || shapeOut == OneVregOutAtIn || shapeOut == OneVregOutScalar || gOp.Out[0].OverwriteClass != nil {
// If class overwrite is happening, that's not really a mask but a vreg.
outType = fmt.Sprintf("Vec%d", *gOp.Out[0].Bits)
} else if shapeOut == OneGregOut {
diff --git a/src/simd/_gen/simdgen/gen_simdrules.go b/src/simd/_gen/simdgen/gen_simdrules.go
index e824fe7..311436e 100644
--- a/src/simd/_gen/simdgen/gen_simdrules.go
+++ b/src/simd/_gen/simdgen/gen_simdrules.go
@@ -167,7 +167,7 @@
}
var tplName string
// If class overwrite is happening, that's not really a mask but a vreg.
- if opOutShape == OneVregOut || opOutShape == OneVregOutAtIn || gOp.Out[0].OverwriteClass != nil {
+ if opOutShape == OneVregOut || opOutShape == OneVregOutAtIn || opOutShape == OneVregOutScalar || gOp.Out[0].OverwriteClass != nil {
switch opInShape {
case OneImmIn:
tplName = "pureVreg"
diff --git a/src/simd/_gen/simdgen/gen_simdssa.go b/src/simd/_gen/simdgen/gen_simdssa.go
index ed92706..6e10c18 100644
--- a/src/simd/_gen/simdgen/gen_simdssa.go
+++ b/src/simd/_gen/simdgen/gen_simdssa.go
@@ -105,6 +105,7 @@
"v3kv",
"v11Imm8",
"v11ImmIn1",
+ "v11Scalar",
"vkvImm8",
"v21Imm8",
"v21List",
@@ -149,6 +150,8 @@
}
if shapeOut == OneVregOutAtIn {
regShape += "ResultInArg0"
+ } else if shapeOut == OneVregOutScalar {
+ regShape += "Scalar"
}
if shapeIn == OneImmIn || shapeIn == OneKmaskImmIn {
if immOpArg != "" {
diff --git a/src/simd/_gen/simdgen/gen_utility.go b/src/simd/_gen/simdgen/gen_utility.go
index 1e5f163c7..a17d3ac 100644
--- a/src/simd/_gen/simdgen/gen_utility.go
+++ b/src/simd/_gen/simdgen/gen_utility.go
@@ -91,12 +91,13 @@
)

const (
- InvalidOut outShape = iota
- NoOut // no output
- OneVregOut // (one) vector register output
- OneGregOut // (one) general register output
- OneKmaskOut // mask output
- OneVregOutAtIn // the first input is also the output
+ InvalidOut outShape = iota
+ NoOut // no output
+ OneVregOut // (one) vector register output
+ OneGregOut // (one) general register output
+ OneKmaskOut // mask output
+ OneVregOutAtIn // the first input is also the output
+ OneVregOutScalar // the vector register output gets scalar result in lane 0 and other lanes are zeroed
)

const (
@@ -130,6 +131,9 @@
outputReg = op.Out[0].AsmPos
if op.Out[0].Class == "vreg" {
shapeOut = OneVregOut
+ if op.Out[0].TreatLikeAScalarOfSize != nil {
+ shapeOut = OneVregOutScalar
+ }
} else if op.Out[0].Class == "greg" {
shapeOut = OneGregOut
} else if op.Out[0].Class == "mask" {
diff --git a/src/simd/ops_arm64.go b/src/simd/ops_arm64.go
index fd0828e..c19e4d6 100644
--- a/src/simd/ops_arm64.go
+++ b/src/simd/ops_arm64.go
@@ -534,6 +534,44 @@
// Asm: VSUB, CPU Feature: NEON
func (x Uint64x2) Sub(y Uint64x2) Uint64x2

+/* SumAcross */
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Int8x16) SumAcross() Int8x16
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Int16x8) SumAcross() Int16x8
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Int32x4) SumAcross() Int32x4
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Uint8x16) SumAcross() Uint8x16
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Uint16x8) SumAcross() Uint16x8
+
+// SumAcross sums all elements across a vector.
+// For x = [x0, x1, ..., xn] the result is [x0+x1+...+xn, 0, 0, ...].
+//
+// Asm: VADDV, CPU Feature: NEON
+func (x Uint32x4) SumAcross() Uint32x4
+
/* Test */

// Test tests bitwise AND of corresponding elements is non-zero.

Change information

Files:
  • M src/cmd/compile/internal/arm64/simdssa.go
  • M src/cmd/compile/internal/arm64/ssa.go
  • M src/cmd/compile/internal/ssa/_gen/simdARM64.rules
  • M src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
  • M src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
  • M src/cmd/compile/internal/ssa/opGen.go
  • M src/cmd/compile/internal/ssa/rewriteARM64.go
  • M src/cmd/compile/internal/ssagen/simdintrinsics.go
  • M src/simd/_gen/simdgen/arm64/ops/AddSub/go.yaml
  • M src/simd/_gen/simdgen/gen_simdMachineOps.go
  • M src/simd/_gen/simdgen/gen_simdrules.go
  • M src/simd/_gen/simdgen/gen_simdssa.go
  • M src/simd/_gen/simdgen/gen_utility.go
  • M src/simd/ops_arm64.go
Change size: M
Delta: 14 files changed, 198 insertions(+), 16 deletions(-)
Open in Gerrit

Related details

Attention set is empty
Submit Requirements:
  • Code-Review: requirement is not satisfied
  • No-Unresolved-Comments: requirement satisfied
  • Review-Enforcement: requirement is not satisfied
  • TryBots-Pass: requirement is not satisfied
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newchange
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: If1d3f76a2619099c4c0bed3d6c30cea716143ce8
Gerrit-Change-Number: 718400
Gerrit-PatchSet: 1
Gerrit-Owner: Alexander Musman <alexande...@gmail.com>
unsatisfied_requirement
satisfied_requirement
open
diffy

Alexander Musman (Gerrit)

unread,
Nov 6, 2025, 6:45:45 AM (5 days ago) Nov 6
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Alexander Musman uploaded new patchset

Alexander Musman uploaded patch set #2 to this change.
Open in Gerrit

Related details

Attention set is empty
Submit Requirements:
  • Code-Review: requirement is not satisfied
  • No-Unresolved-Comments: requirement satisfied
  • Review-Enforcement: requirement is not satisfied
  • TryBots-Pass: requirement is not satisfied
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newpatchset
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: If1d3f76a2619099c4c0bed3d6c30cea716143ce8
Gerrit-Change-Number: 718400
Gerrit-PatchSet: 2
Gerrit-Owner: Alexander Musman <alexande...@gmail.com>
unsatisfied_requirement
satisfied_requirement
open
diffy
Reply all
Reply to author
Forward
0 new messages