[go/dev.simd] [dev.simd] simd: add ARM64 NEON Broadcast and String helpers

2 views
Skip to first unread message

Alexander Musman (Gerrit)

unread,
Apr 15, 2026, 9:52:18 AM (14 days ago) Apr 15
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Alexander Musman has uploaded the change for review

Commit message

[dev.simd] simd: add ARM64 NEON Broadcast and String helpers

Add Broadcast functions that replicate a scalar value across all lanes
of a vector, using the ARM64 VDUP instruction.

This adds:
- Broadcast1To{2,4,8,16} methods on vector types (lowering to VDUP)
- Broadcast{Type} constructors (e.g. BroadcastFloat32x4(3.14))
- String() methods on all ARM64 vector types

Supports element widths B/H/S/D for signed and unsigned integer types, and S/D for float types.

Example demonstrating broadcast and String on ARM64:
```
package main

import (
"fmt"
"simd/archsimd"
)

func main() {
v := archsimd.BroadcastFloat32x4(2.5)
w := v.Add(archsimd.BroadcastFloat32x4(v.GetElem(3)))
fmt.Printf("w = %s\n", w.String())
}
```
Output: w = {5,5,5,5}
Change-Id: Ica3e6c06e4beebd288e82ff961a3201351ecd352

Change diff

diff --git a/src/cmd/compile/internal/arm64/simdssa.go b/src/cmd/compile/internal/arm64/simdssa.go
index bc00084..5fe63ab 100644
--- a/src/cmd/compile/internal/arm64/simdssa.go
+++ b/src/cmd/compile/internal/arm64/simdssa.go
@@ -12,6 +12,18 @@
func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
var p *obj.Prog
switch v.Op {
+ case ssa.OpARM64VDUPBbcast:
+ p = simdV11ImmIn1(s, v, arm64.ARNG_B)
+
+ case ssa.OpARM64VDUPDbcast:
+ p = simdV11ImmIn1(s, v, arm64.ARNG_D)
+
+ case ssa.OpARM64VDUPHbcast:
+ p = simdV11ImmIn1(s, v, arm64.ARNG_H)
+
+ case ssa.OpARM64VDUPSbcast:
+ p = simdV11ImmIn1(s, v, arm64.ARNG_S)
+
case ssa.OpARM64VDUPDextr:
p = simdV11ScalarImmIn1(s, v, arm64.ARNG_D)

diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index 1d837d9..3e03c7f 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -198,6 +198,24 @@
return
}

+// allLanes converts an element arrangement to its 128-bit vector arrangement.
+// e.g., ARNG_B -> ARNG_16B, ARNG_S -> ARNG_4S
+func allLanes(arng int16) int16 {
+ switch arng {
+ case arm64.ARNG_B:
+ return arm64.ARNG_16B
+ case arm64.ARNG_H:
+ return arm64.ARNG_8H
+ case arm64.ARNG_S:
+ return arm64.ARNG_4S
+ case arm64.ARNG_D:
+ return arm64.ARNG_2D
+ default:
+ base.Fatalf("unsupported element arrangement: %d", arng)
+ return 0
+ }
+}
+
// simdV01Imm generates a VMOVI-like instruction, e.g. VMOVI $0, V0.B16
func simdV01Imm(s *ssagen.State, v *ssa.Value, arrangement int16) *obj.Prog {
p := s.Prog(v.Op.Asm())
@@ -223,6 +241,17 @@
return simdV11Asm(s, v.Op.Asm(), v.Args[0].Reg(), v.Reg(), arrangement)
}

+// simdV11ImmIn1 generates a Broadcast1ToN instruction,
+// e.g. VDUP V1.S[0], V0.S4 (duplicate element 0 to all lanes)
+// The arrangement parameter specifies the element arrangement (e.g., ARNG_S, ARNG_D)
+func simdV11ImmIn1(s *ssagen.State, v *ssa.Value, arrangement int16) *obj.Prog {
+ p := s.Prog(v.Op.Asm())
+ p.From = simdRegElem(v.Args[0].Reg(), arrangement, int16(v.AuxUInt8()))
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = simdRegArng(v.Reg(), allLanes(arrangement))
+ return p
+}
+
// simdV11Scalar generates vector-to-scalar reduction operations, e.g. VUADDLV V1.B8, V0
func simdV11Scalar(s *ssagen.State, v *ssa.Value, arrangement int16) *obj.Prog {
p := s.Prog(v.Op.Asm())
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
index b94026b..e5165a9 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
@@ -10,6 +10,16 @@
(AddUint16x8 ...) => (VADD8H ...)
(AddUint32x4 ...) => (VADD4S ...)
(AddUint64x2 ...) => (VADD2D ...)
+(Broadcast1To2Float64x2 x) => (VDUPDbcast [0] x)
+(Broadcast1To2Int64x2 x) => (VDUPDbcast [0] x)
+(Broadcast1To2Uint64x2 x) => (VDUPDbcast [0] x)
+(Broadcast1To4Float32x4 x) => (VDUPSbcast [0] x)
+(Broadcast1To4Int32x4 x) => (VDUPSbcast [0] x)
+(Broadcast1To4Uint32x4 x) => (VDUPSbcast [0] x)
+(Broadcast1To8Int16x8 x) => (VDUPHbcast [0] x)
+(Broadcast1To8Uint16x8 x) => (VDUPHbcast [0] x)
+(Broadcast1To16Int8x16 x) => (VDUPBbcast [0] x)
+(Broadcast1To16Uint8x16 x) => (VDUPBbcast [0] x)
(GetElemFloat32x4 ...) => (VDUPSextr ...)
(GetElemFloat64x2 ...) => (VDUPDextr ...)
(GetElemInt8x16 ...) => (VMOVBextr ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
index 2961212..e66c68c 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
@@ -15,7 +15,11 @@
{name: "VMUL4S", argLength: 2, reg: v21, asm: "VMUL", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VMUL8H", argLength: 2, reg: v21, asm: "VMUL", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VMUL16B", argLength: 2, reg: v21, asm: "VMUL", commutative: true, typ: "Vec128", resultInArg0: false},
+ {name: "VDUPBbcast", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VDUPDbcast", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VDUPDextr", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VDUPHbcast", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VDUPSbcast", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VDUPSextr", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VMOVBextr", argLength: 1, reg: vgp, asm: "VMOV", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false},
{name: "VMOVBins", argLength: 2, reg: vgpv, asm: "VMOV", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 3fe2d81..08fe804 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -4507,7 +4507,11 @@
OpARM64VMUL4S
OpARM64VMUL8H
OpARM64VMUL16B
+ OpARM64VDUPBbcast
+ OpARM64VDUPDbcast
OpARM64VDUPDextr
+ OpARM64VDUPHbcast
+ OpARM64VDUPSbcast
OpARM64VDUPSextr
OpARM64VMOVBextr
OpARM64VMOVBins
@@ -70047,6 +70051,34 @@
},
},
{
+ name: "VDUPBbcast",
+ auxType: auxUInt8,
+ argLen: 1,
+ asm: arm64.AVDUP,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
+ name: "VDUPDbcast",
+ auxType: auxUInt8,
+ argLen: 1,
+ asm: arm64.AVDUP,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
name: "VDUPDextr",
auxType: auxUInt8,
argLen: 1,
@@ -70061,6 +70093,34 @@
},
},
{
+ name: "VDUPHbcast",
+ auxType: auxUInt8,
+ argLen: 1,
+ asm: arm64.AVDUP,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
+ name: "VDUPSbcast",
+ auxType: auxUInt8,
+ argLen: 1,
+ asm: arm64.AVDUP,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
name: "VDUPSextr",
auxType: auxUInt8,
argLen: 1,
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index d4e4fdb..009238c 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -603,6 +603,26 @@
return true
case OpBitRev8:
return rewriteValueARM64_OpBitRev8(v)
+ case OpBroadcast1To16Int8x16:
+ return rewriteValueARM64_OpBroadcast1To16Int8x16(v)
+ case OpBroadcast1To16Uint8x16:
+ return rewriteValueARM64_OpBroadcast1To16Uint8x16(v)
+ case OpBroadcast1To2Float64x2:
+ return rewriteValueARM64_OpBroadcast1To2Float64x2(v)
+ case OpBroadcast1To2Int64x2:
+ return rewriteValueARM64_OpBroadcast1To2Int64x2(v)
+ case OpBroadcast1To2Uint64x2:
+ return rewriteValueARM64_OpBroadcast1To2Uint64x2(v)
+ case OpBroadcast1To4Float32x4:
+ return rewriteValueARM64_OpBroadcast1To4Float32x4(v)
+ case OpBroadcast1To4Int32x4:
+ return rewriteValueARM64_OpBroadcast1To4Int32x4(v)
+ case OpBroadcast1To4Uint32x4:
+ return rewriteValueARM64_OpBroadcast1To4Uint32x4(v)
+ case OpBroadcast1To8Int16x8:
+ return rewriteValueARM64_OpBroadcast1To8Int16x8(v)
+ case OpBroadcast1To8Uint16x8:
+ return rewriteValueARM64_OpBroadcast1To8Uint16x8(v)
case OpBswap16:
v.Op = OpARM64REV16W
return true
@@ -17508,6 +17528,126 @@
return true
}
}
+func rewriteValueARM64_OpBroadcast1To16Int8x16(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (Broadcast1To16Int8x16 x)
+ // result: (VDUPBbcast [0] x)
+ for {
+ x := v_0
+ v.reset(OpARM64VDUPBbcast)
+ v.AuxInt = uint8ToAuxInt(0)
+ v.AddArg(x)
+ return true
+ }
+}
+func rewriteValueARM64_OpBroadcast1To16Uint8x16(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (Broadcast1To16Uint8x16 x)
+ // result: (VDUPBbcast [0] x)
+ for {
+ x := v_0
+ v.reset(OpARM64VDUPBbcast)
+ v.AuxInt = uint8ToAuxInt(0)
+ v.AddArg(x)
+ return true
+ }
+}
+func rewriteValueARM64_OpBroadcast1To2Float64x2(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (Broadcast1To2Float64x2 x)
+ // result: (VDUPDbcast [0] x)
+ for {
+ x := v_0
+ v.reset(OpARM64VDUPDbcast)
+ v.AuxInt = uint8ToAuxInt(0)
+ v.AddArg(x)
+ return true
+ }
+}
+func rewriteValueARM64_OpBroadcast1To2Int64x2(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (Broadcast1To2Int64x2 x)
+ // result: (VDUPDbcast [0] x)
+ for {
+ x := v_0
+ v.reset(OpARM64VDUPDbcast)
+ v.AuxInt = uint8ToAuxInt(0)
+ v.AddArg(x)
+ return true
+ }
+}
+func rewriteValueARM64_OpBroadcast1To2Uint64x2(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (Broadcast1To2Uint64x2 x)
+ // result: (VDUPDbcast [0] x)
+ for {
+ x := v_0
+ v.reset(OpARM64VDUPDbcast)
+ v.AuxInt = uint8ToAuxInt(0)
+ v.AddArg(x)
+ return true
+ }
+}
+func rewriteValueARM64_OpBroadcast1To4Float32x4(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (Broadcast1To4Float32x4 x)
+ // result: (VDUPSbcast [0] x)
+ for {
+ x := v_0
+ v.reset(OpARM64VDUPSbcast)
+ v.AuxInt = uint8ToAuxInt(0)
+ v.AddArg(x)
+ return true
+ }
+}
+func rewriteValueARM64_OpBroadcast1To4Int32x4(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (Broadcast1To4Int32x4 x)
+ // result: (VDUPSbcast [0] x)
+ for {
+ x := v_0
+ v.reset(OpARM64VDUPSbcast)
+ v.AuxInt = uint8ToAuxInt(0)
+ v.AddArg(x)
+ return true
+ }
+}
+func rewriteValueARM64_OpBroadcast1To4Uint32x4(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (Broadcast1To4Uint32x4 x)
+ // result: (VDUPSbcast [0] x)
+ for {
+ x := v_0
+ v.reset(OpARM64VDUPSbcast)
+ v.AuxInt = uint8ToAuxInt(0)
+ v.AddArg(x)
+ return true
+ }
+}
+func rewriteValueARM64_OpBroadcast1To8Int16x8(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (Broadcast1To8Int16x8 x)
+ // result: (VDUPHbcast [0] x)
+ for {
+ x := v_0
+ v.reset(OpARM64VDUPHbcast)
+ v.AuxInt = uint8ToAuxInt(0)
+ v.AddArg(x)
+ return true
+ }
+}
+func rewriteValueARM64_OpBroadcast1To8Uint16x8(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (Broadcast1To8Uint16x8 x)
+ // result: (VDUPHbcast [0] x)
+ for {
+ x := v_0
+ v.reset(OpARM64VDUPHbcast)
+ v.AuxInt = uint8ToAuxInt(0)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueARM64_OpCondSelect(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
diff --git a/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go b/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
index 1467012..d21c176 100644
--- a/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
@@ -20,6 +20,16 @@
addF(simdPackage, "Uint16x8.Add", opLen2(ssa.OpAddUint16x8, types.TypeVec128), sys.ARM64)
addF(simdPackage, "Uint32x4.Add", opLen2(ssa.OpAddUint32x4, types.TypeVec128), sys.ARM64)
addF(simdPackage, "Uint64x2.Add", opLen2(ssa.OpAddUint64x2, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Float64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Float64x2, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Int64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Int64x2, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Uint64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Uint64x2, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Float32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float32x4, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Int32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int32x4, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Uint32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint32x4, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Int16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int16x8, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Uint16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint16x8, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Int8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int8x16, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Uint8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint8x16, types.TypeVec128), sys.ARM64)
addF(simdPackage, "Float32x4.GetElem", opLen1Imm(ssa.OpGetElemFloat32x4, types.Types[types.TFLOAT32], 0, 3), sys.ARM64)
addF(simdPackage, "Float64x2.GetElem", opLen1Imm(ssa.OpGetElemFloat64x2, types.Types[types.TFLOAT64], 0, 1), sys.ARM64)
addF(simdPackage, "Int8x16.GetElem", opLen1Imm(ssa.OpGetElemInt8x16, types.Types[types.TINT8], 0, 15), sys.ARM64)
diff --git a/src/simd/archsimd/_gen/simdgen/arch.go b/src/simd/archsimd/_gen/simdgen/arch.go
index 73fd149..794d522 100644
--- a/src/simd/archsimd/_gen/simdgen/arch.go
+++ b/src/simd/archsimd/_gen/simdgen/arch.go
@@ -62,6 +62,7 @@

var arm64RegInfoKeys = []string{
"v11",
+ "v11ImmIn1",
"v11ScalarImmIn1",
"v21",
"vgpImmIn1",
@@ -77,11 +78,12 @@
}

var arm64RegInfoSet = map[string]bool{
- "v11": true,
- "v21": true,
- "vgp": true,
- "vgpv": true,
- "vfpv": true,
+ "v11": true,
+ "v21": true,
+ "vgp": true,
+ "vgpv": true,
+ "vfpv": true,
+ "v11ImmIn1": true,
}

// arm64Arrangements contains the SIMD arrangement suffixes for ARM64 NEON.
diff --git a/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/categories.yaml b/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/categories.yaml
index 0395e69..6187bd5 100644
--- a/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/categories.yaml
@@ -9,3 +9,31 @@
commutative: false
documentation: !string |-
// NAME sets a single constant-indexed element's value.
+
+- go: Broadcast1To16
+ constImm: 0
+ commutative: false
+ documentation: !string |-
+ // NAME copies the lowest element of its input to all 16 elements of
+ // the output vector.
+
+- go: Broadcast1To8
+ constImm: 0
+ commutative: false
+ documentation: !string |-
+ // NAME copies the lowest element of its input to all 8 elements of
+ // the output vector.
+
+- go: Broadcast1To4
+ constImm: 0
+ commutative: false
+ documentation: !string |-
+ // NAME copies the lowest element of its input to all 4 elements of
+ // the output vector.
+
+- go: Broadcast1To2
+ constImm: 0
+ commutative: false
+ documentation: !string |-
+ // NAME copies the lowest element of its input to all 2 elements of
+ // the output vector.
diff --git a/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/go.yaml
index 29dd876..8394406 100644
--- a/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/go.yaml
@@ -70,3 +70,54 @@
out:
- *vector

+# Broadcast1To16 VDUP (duplicate element 0 to all 16 byte lanes)
+- go: Broadcast1To16
+ asm: "VDUP"
+ SSAVariant: "bcast"
+ in:
+ - *imm0
+ - &bcast8
+ go: $t
+ bits: 128
+ elemBits: 8
+ out:
+ - *bcast8
+
+# Broadcast1To8 VDUP (duplicate element 0 to all 8 halfword lanes)
+- go: Broadcast1To8
+ asm: "VDUP"
+ SSAVariant: "bcast"
+ in:
+ - *imm0
+ - &bcast16
+ go: $t
+ bits: 128
+ elemBits: 16
+ out:
+ - *bcast16
+
+# Broadcast1To4 VDUP (duplicate element 0 to all 4 word lanes)
+- go: Broadcast1To4
+ asm: "VDUP"
+ SSAVariant: "bcast"
+ in:
+ - *imm0
+ - &bcast32
+ go: $t
+ bits: 128
+ elemBits: 32
+ out:
+ - *bcast32
+
+# Broadcast1To2 VDUP (duplicate element 0 to both doubleword lanes)
+- go: Broadcast1To2
+ asm: "VDUP"
+ SSAVariant: "bcast"
+ in:
+ - *imm0
+ - &bcast64
+ go: $t
+ bits: 128
+ elemBits: 64
+ out:
+ - *bcast64
diff --git a/src/simd/archsimd/_gen/simdgen/gen_utility.go b/src/simd/archsimd/_gen/simdgen/gen_utility.go
index 2717916..97aa632 100644
--- a/src/simd/archsimd/_gen/simdgen/gen_utility.go
+++ b/src/simd/archsimd/_gen/simdgen/gen_utility.go
@@ -732,6 +732,9 @@

if immType == ConstImm || immType == ConstVarImm {
op.In[0].Const = op.ConstImm
+ // If the category declares a constImm, the immediate is fully constant;
+ // clear ImmOffset to ensure this is treated as ConstImm (no aux field).
+ op.In[0].ImmOffset = nil
}
// Otherwise, just not port it - e.g. {VPCMP[BWDQ] imm=0} and {VPCMPEQ[BWDQ]} are
// the same operations "Equal", [dedupgodef] should be able to distinguish them.
diff --git a/src/simd/archsimd/_gen/tmplgen/main.go b/src/simd/archsimd/_gen/tmplgen/main.go
index 34c29fe..12cd855 100644
--- a/src/simd/archsimd/_gen/tmplgen/main.go
+++ b/src/simd/archsimd/_gen/tmplgen/main.go
@@ -899,6 +899,24 @@
}
`)

+var broadcastTemplateArm64 = shapedTemplateOf(arm64Shapes, "arm64_broadcast", `
+// Broadcast{{.VType}} returns a vector with the input
+// x assigned to all elements of the output.
+func Broadcast{{.VType}}(x {{.Etype}}) {{.VType}} {
+ var z {{.VType}}
+ return z.SetElem(0, x).Broadcast1To{{.Count}}()
+}
+`)
+
+var stringTemplateArm64 = shapedTemplateOf(arm64Shapes, "arm64_String methods", `
+// String returns a string representation of SIMD vector x.
+func (x {{.VType}}) String() string {
+ var s [{{.Count}}]{{.Etype}}
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+`)
+
var maskCvtTemplate = shapedTemplateOf(intShapes, "Mask conversions", `
// ToMask converts from {{.Base}}{{.WxC}} to Mask{{.WxC}}, mask element is set to true when the corresponding vector element is non-zero.
func (from {{.Base}}{{.WxC}}) ToMask() (to Mask{{.WxC}}) {
@@ -933,6 +951,7 @@
// ARM64-specific
bhArm64 := flag.String("bhArm64", TD+"arm64_binary_helpers_test.go", "file name for ARM64 binary test helpers")
slArm64 := flag.String("slArm64", SIMD+"slice_gen_arm64.go", "file name for ARM64 slice operations")
+ opArm64 := flag.String("opArm64", SIMD+"other_gen_arm64.go", "file name for ARM64 other operations")
flag.Parse()

if *sl != "" {
@@ -1000,6 +1019,9 @@
if *bhArm64 != "" {
one(*bhArm64, curryTestPrologue("binary simd methods", "arm64"), binaryTemplateArm64)
}
+ if *opArm64 != "" {
+ one(*opArm64, prologue, broadcastTemplateArm64, stringTemplateArm64)
+ }

nonTemplateRewrites(SSA+"tern_helpers.go", ssaPrologue, classifyBooleanSIMD, ternOpForLogical)

diff --git a/src/simd/archsimd/internal/simd_test/arm64_simd_test.go b/src/simd/archsimd/internal/simd_test/arm64_simd_test.go
new file mode 100644
index 0000000..d55aa0f
--- /dev/null
+++ b/src/simd/archsimd/internal/simd_test/arm64_simd_test.go
@@ -0,0 +1,103 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && arm64
+
+package simd_test
+
+import (
+ "simd/archsimd"
+ "testing"
+)
+
+func TestBroadcastUint32x4(t *testing.T) {
+ s := make([]uint32, 4, 4)
+ archsimd.BroadcastUint32x4(123456789).StoreSlice(s)
+ checkSlices(t, s, []uint32{123456789, 123456789, 123456789, 123456789})
+}
+
+func TestBroadcastFloat32x4(t *testing.T) {
+ s := make([]float32, 4, 4)
+ archsimd.BroadcastFloat32x4(3.14).StoreSlice(s)
+ checkSlices(t, s, []float32{3.14, 3.14, 3.14, 3.14})
+}
+
+func TestBroadcastFloat64x2(t *testing.T) {
+ s := make([]float64, 2, 2)
+ archsimd.BroadcastFloat64x2(3.14).StoreSlice(s)
+ checkSlices(t, s, []float64{3.14, 3.14})
+}
+
+func TestBroadcastUint64x2(t *testing.T) {
+ s := make([]uint64, 2, 2)
+ archsimd.BroadcastUint64x2(123456789012345).StoreSlice(s)
+ checkSlices(t, s, []uint64{123456789012345, 123456789012345})
+}
+
+func TestString(t *testing.T) {
+ x := archsimd.LoadUint32x4Slice([]uint32{0, 1, 2, 3})
+ y := archsimd.LoadInt64x2Slice([]int64{-44, -5})
+ z := archsimd.LoadFloat32x4Slice([]float32{0.5, 1.5, -2.5, 3.5e9})
+ w := archsimd.LoadFloat64x2Slice([]float64{-2.5, 3.5e9})
+
+ sx := "{0,1,2,3}"
+ sy := "{-44,-5}"
+ sz := "{0.5,1.5,-2.5,3.5e+09}"
+ sw := "{-2.5,3.5e+09}"
+
+ if x.String() != sx {
+ t.Errorf("x=%s wanted %s", x, sx)
+ }
+ if y.String() != sy {
+ t.Errorf("y=%s wanted %s", y, sy)
+ }
+ if z.String() != sz {
+ t.Errorf("z=%s wanted %s", z, sz)
+ }
+ if w.String() != sw {
+ t.Errorf("w=%s wanted %s", w, sw)
+ }
+ t.Logf("w=%s", w)
+ t.Logf("x=%s", x)
+ t.Logf("y=%s", y)
+ t.Logf("z=%s", z)
+}
+
+func TestBroadcastUint16x8(t *testing.T) {
+ s := make([]uint16, 8, 8)
+ archsimd.BroadcastUint16x8(12345).StoreSlice(s)
+ checkSlices(t, s, []uint16{12345, 12345, 12345, 12345, 12345, 12345, 12345, 12345})
+}
+
+func TestBroadcastInt8x16(t *testing.T) {
+ s := make([]int8, 16, 16)
+ archsimd.BroadcastInt8x16(-123).StoreSlice(s)
+ checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123,
+ -123, -123, -123, -123, -123, -123, -123, -123})
+}
+
+func TestBroadcastUint8x16(t *testing.T) {
+ s := make([]uint8, 16, 16)
+ archsimd.BroadcastUint8x16(200).StoreSlice(s)
+ checkSlices(t, s, []uint8{200, 200, 200, 200, 200, 200, 200, 200,
+ 200, 200, 200, 200, 200, 200, 200, 200})
+}
+
+func TestBroadcastInt16x8(t *testing.T) {
+ s := make([]int16, 8, 8)
+ archsimd.BroadcastInt16x8(-12345).StoreSlice(s)
+ checkSlices(t, s, []int16{-12345, -12345, -12345, -12345, -12345, -12345, -12345, -12345})
+}
+
+func TestBroadcastInt32x4(t *testing.T) {
+ s := make([]int32, 4, 4)
+ archsimd.BroadcastInt32x4(-123456789).StoreSlice(s)
+ checkSlices(t, s, []int32{-123456789, -123456789, -123456789, -123456789})
+}
+
+func TestBroadcastInt64x2(t *testing.T) {
+ s := make([]int64, 2, 2)
+ archsimd.BroadcastInt64x2(-123456789).StoreSlice(s)
+ checkSlices(t, s, []int64{-123456789, -123456789})
+}
diff --git a/src/simd/archsimd/ops_arm64.go b/src/simd/archsimd/ops_arm64.go
index 66c3d36..57af31b 100644
--- a/src/simd/archsimd/ops_arm64.go
+++ b/src/simd/archsimd/ops_arm64.go
@@ -55,6 +55,74 @@
// Asm: VADD, CPU Feature: NEON
func (x Uint64x2) Add(y Uint64x2) Uint64x2

+/* Broadcast1To2 */
+
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Float64x2) Broadcast1To2() Float64x2
+
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Int64x2) Broadcast1To2() Int64x2
+
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Uint64x2) Broadcast1To2() Uint64x2
+
+/* Broadcast1To4 */
+
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Float32x4) Broadcast1To4() Float32x4
+
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Int32x4) Broadcast1To4() Int32x4
+
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Uint32x4) Broadcast1To4() Uint32x4
+
+/* Broadcast1To8 */
+
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Int16x8) Broadcast1To8() Int16x8
+
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Uint16x8) Broadcast1To8() Uint16x8
+
+/* Broadcast1To16 */
+
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Int8x16) Broadcast1To16() Int8x16
+
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Uint8x16) Broadcast1To16() Uint8x16
+
/* GetElem */

// GetElem retrieves a single constant-indexed element's value.
diff --git a/src/simd/archsimd/other_gen_arm64.go b/src/simd/archsimd/other_gen_arm64.go
new file mode 100644
index 0000000..daea458
--- /dev/null
+++ b/src/simd/archsimd/other_gen_arm64.go
@@ -0,0 +1,145 @@
+// Code generated by 'tmplgen'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+// BroadcastInt8x16 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastInt8x16(x int8) Int8x16 {
+ var z Int8x16
+ return z.SetElem(0, x).Broadcast1To16()
+}
+
+// BroadcastInt16x8 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastInt16x8(x int16) Int16x8 {
+ var z Int16x8
+ return z.SetElem(0, x).Broadcast1To8()
+}
+
+// BroadcastInt32x4 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastInt32x4(x int32) Int32x4 {
+ var z Int32x4
+ return z.SetElem(0, x).Broadcast1To4()
+}
+
+// BroadcastInt64x2 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastInt64x2(x int64) Int64x2 {
+ var z Int64x2
+ return z.SetElem(0, x).Broadcast1To2()
+}
+
+// BroadcastUint8x16 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastUint8x16(x uint8) Uint8x16 {
+ var z Uint8x16
+ return z.SetElem(0, x).Broadcast1To16()
+}
+
+// BroadcastUint16x8 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastUint16x8(x uint16) Uint16x8 {
+ var z Uint16x8
+ return z.SetElem(0, x).Broadcast1To8()
+}
+
+// BroadcastUint32x4 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastUint32x4(x uint32) Uint32x4 {
+ var z Uint32x4
+ return z.SetElem(0, x).Broadcast1To4()
+}
+
+// BroadcastUint64x2 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastUint64x2(x uint64) Uint64x2 {
+ var z Uint64x2
+ return z.SetElem(0, x).Broadcast1To2()
+}
+
+// BroadcastFloat32x4 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastFloat32x4(x float32) Float32x4 {
+ var z Float32x4
+ return z.SetElem(0, x).Broadcast1To4()
+}
+
+// BroadcastFloat64x2 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastFloat64x2(x float64) Float64x2 {
+ var z Float64x2
+ return z.SetElem(0, x).Broadcast1To2()
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Int8x16) String() string {
+ var s [16]int8
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Int16x8) String() string {
+ var s [8]int16
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Int32x4) String() string {
+ var s [4]int32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Int64x2) String() string {
+ var s [2]int64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Uint8x16) String() string {
+ var s [16]uint8
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Uint16x8) String() string {
+ var s [8]uint16
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Uint32x4) String() string {
+ var s [4]uint32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Uint64x2) String() string {
+ var s [2]uint64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Float32x4) String() string {
+ var s [4]float32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Float64x2) String() string {
+ var s [2]float64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
diff --git a/src/simd/archsimd/string.go b/src/simd/archsimd/string.go
index 77500ad..7dc766b 100644
--- a/src/simd/archsimd/string.go
+++ b/src/simd/archsimd/string.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

-//go:build goexperiment.simd && amd64
+//go:build goexperiment.simd

package archsimd

Change information

Files:
  • M src/cmd/compile/internal/arm64/simdssa.go
  • M src/cmd/compile/internal/arm64/ssa.go
  • M src/cmd/compile/internal/ssa/_gen/simdARM64.rules
  • M src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
  • M src/cmd/compile/internal/ssa/opGen.go
  • M src/cmd/compile/internal/ssa/rewriteARM64.go
  • M src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
  • M src/simd/archsimd/_gen/simdgen/arch.go
  • M src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/categories.yaml
  • M src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/go.yaml
  • M src/simd/archsimd/_gen/simdgen/gen_utility.go
  • M src/simd/archsimd/_gen/tmplgen/main.go
  • A src/simd/archsimd/internal/simd_test/arm64_simd_test.go
  • M src/simd/archsimd/ops_arm64.go
  • A src/simd/archsimd/other_gen_arm64.go
  • M src/simd/archsimd/string.go
Change size: L
Delta: 16 files changed, 693 insertions(+), 6 deletions(-)
Open in Gerrit

Related details

Attention set is empty
Submit Requirements:
  • requirement is not satisfiedCode-Review
  • requirement satisfiedNo-Unresolved-Comments
  • requirement is not satisfiedReview-Enforcement
  • requirement is not satisfiedTryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newchange
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: Ica3e6c06e4beebd288e82ff961a3201351ecd352
Gerrit-Change-Number: 767260
Gerrit-PatchSet: 1
Gerrit-Owner: Alexander Musman <alexande...@gmail.com>
unsatisfied_requirement
satisfied_requirement
open
diffy

David Chase (Gerrit)

unread,
Apr 27, 2026, 4:04:24 PM (2 days ago) Apr 27
to Alexander Musman, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com
Attention needed from Alexander Musman

David Chase added 2 comments

Patchset-level comments
File-level comment, Patchset 1 (Latest):
David Chase . resolved

I am cherry-picking my way up the stack, and commenting as I go.

File src/simd/archsimd/_gen/simdgen/gen_utility.go
Line 737, Patchset 1 (Latest): op.In[0].ImmOffset = nil
David Chase . unresolved

This change breaks generation for AMD64 -- all sorts of operations lose their immediate operands. I tried reversing it and that led to different breakage generating files for ARM64.

After much "experimentation" the "fix" that I arrived at was to choose a hacky name for the immediate operands for arm64 Broadcast operations (I used "@") and if the immediate operand had that name, then set the offset to nil. This will fail to unify with the names assigned in arm64/emit.go at line 68, however those names are not good -- they don't allow any other name to be specified in the yaml file, whether an informative one like "lane" or "precision" or a magic hacky one like "@".

Open in Gerrit

Related details

Attention is currently required from:
  • Alexander Musman
Submit Requirements:
    • requirement is not satisfiedCode-Review
    • requirement is not satisfiedNo-Unresolved-Comments
    • requirement is not satisfiedReview-Enforcement
    • requirement is not satisfiedTryBots-Pass
    Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
    Gerrit-MessageType: comment
    Gerrit-Project: go
    Gerrit-Branch: dev.simd
    Gerrit-Change-Id: Ica3e6c06e4beebd288e82ff961a3201351ecd352
    Gerrit-Change-Number: 767260
    Gerrit-PatchSet: 1
    Gerrit-Owner: Alexander Musman <alexande...@gmail.com>
    Gerrit-CC: David Chase <drc...@google.com>
    Gerrit-Attention: Alexander Musman <alexande...@gmail.com>
    Gerrit-Comment-Date: Mon, 27 Apr 2026 20:04:20 +0000
    Gerrit-HasComments: Yes
    Gerrit-Has-Labels: No
    unsatisfied_requirement
    open
    diffy

    Alexander Musman (Gerrit)

    unread,
    2:29 AM (3 hours ago) 2:29 AM
    to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com
    Attention needed from Alexander Musman

    Alexander Musman uploaded new patchset

    Alexander Musman uploaded patch set #2 to this change.
    Open in Gerrit

    Related details

    Attention is currently required from:
    • Alexander Musman
    Submit Requirements:
    • requirement is not satisfiedCode-Review
    • requirement is not satisfiedNo-Unresolved-Comments
    • requirement is not satisfiedReview-Enforcement
    • requirement is not satisfiedTryBots-Pass
    Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
    Gerrit-MessageType: newpatchset
    Gerrit-Project: go
    Gerrit-Branch: dev.simd
    Gerrit-Change-Id: Ica3e6c06e4beebd288e82ff961a3201351ecd352
    Gerrit-Change-Number: 767260
    Gerrit-PatchSet: 2
    unsatisfied_requirement
    open
    diffy
    Reply all
    Reply to author
    Forward
    0 new messages