[go/dev.simd] [dev.simd] simd: add ARM64 NEON Broadcast and String helpers

2 views

Skip to first unread message

Alexander Musman (Gerrit)

unread,

Apr 15, 2026, 9:52:18 AM (14 days ago) Apr 15

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Alexander Musman has uploaded the change for review

Commit message

[dev.simd] simd: add ARM64 NEON Broadcast and String helpers

Add Broadcast functions that replicate a scalar value across all lanes
of a vector, using the ARM64 VDUP instruction.

This adds:
  - Broadcast1To{2,4,8,16} methods on vector types (lowering to VDUP)
  - Broadcast{Type} constructors (e.g. BroadcastFloat32x4(3.14))
  - String() methods on all ARM64 vector types

Supports element widths B/H/S/D for signed and unsigned integer types, and S/D for float types.

Example demonstrating broadcast and String on ARM64:
```
package main

import (
        "fmt"
        "simd/archsimd"
)

func main() {
        v := archsimd.BroadcastFloat32x4(2.5)
        w := v.Add(archsimd.BroadcastFloat32x4(v.GetElem(3)))
        fmt.Printf("w = %s\n", w.String())
}
```
Output: w = {5,5,5,5}

Change-Id: Ica3e6c06e4beebd288e82ff961a3201351ecd352

Change diff

diff --git a/src/cmd/compile/internal/arm64/simdssa.go b/src/cmd/compile/internal/arm64/simdssa.go
index bc00084..5fe63ab 100644
--- a/src/cmd/compile/internal/arm64/simdssa.go
+++ b/src/cmd/compile/internal/arm64/simdssa.go
@@ -12,6 +12,18 @@
 func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 	var p *obj.Prog
 	switch v.Op {
+	case ssa.OpARM64VDUPBbcast:
+		p = simdV11ImmIn1(s, v, arm64.ARNG_B)
+
+	case ssa.OpARM64VDUPDbcast:
+		p = simdV11ImmIn1(s, v, arm64.ARNG_D)
+
+	case ssa.OpARM64VDUPHbcast:
+		p = simdV11ImmIn1(s, v, arm64.ARNG_H)
+
+	case ssa.OpARM64VDUPSbcast:
+		p = simdV11ImmIn1(s, v, arm64.ARNG_S)
+
 	case ssa.OpARM64VDUPDextr:
 		p = simdV11ScalarImmIn1(s, v, arm64.ARNG_D)
 
diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index 1d837d9..3e03c7f 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -198,6 +198,24 @@
 	return
 }
 
+// allLanes converts an element arrangement to its 128-bit vector arrangement.
+// e.g., ARNG_B -> ARNG_16B, ARNG_S -> ARNG_4S
+func allLanes(arng int16) int16 {
+	switch arng {
+	case arm64.ARNG_B:
+		return arm64.ARNG_16B
+	case arm64.ARNG_H:
+		return arm64.ARNG_8H
+	case arm64.ARNG_S:
+		return arm64.ARNG_4S
+	case arm64.ARNG_D:
+		return arm64.ARNG_2D
+	default:
+		base.Fatalf("unsupported element arrangement: %d", arng)
+		return 0
+	}
+}
+
 // simdV01Imm generates a VMOVI-like instruction, e.g. VMOVI $0, V0.B16
 func simdV01Imm(s *ssagen.State, v *ssa.Value, arrangement int16) *obj.Prog {
 	p := s.Prog(v.Op.Asm())
@@ -223,6 +241,17 @@
 	return simdV11Asm(s, v.Op.Asm(), v.Args[0].Reg(), v.Reg(), arrangement)
 }
 
+// simdV11ImmIn1 generates a Broadcast1ToN instruction,
+// e.g. VDUP V1.S[0], V0.S4 (duplicate element 0 to all lanes)
+// The arrangement parameter specifies the element arrangement (e.g., ARNG_S, ARNG_D)
+func simdV11ImmIn1(s *ssagen.State, v *ssa.Value, arrangement int16) *obj.Prog {
+	p := s.Prog(v.Op.Asm())
+	p.From = simdRegElem(v.Args[0].Reg(), arrangement, int16(v.AuxUInt8()))
+	p.To.Type = obj.TYPE_REG
+	p.To.Reg = simdRegArng(v.Reg(), allLanes(arrangement))
+	return p
+}
+
 // simdV11Scalar generates vector-to-scalar reduction operations, e.g. VUADDLV V1.B8, V0
 func simdV11Scalar(s *ssagen.State, v *ssa.Value, arrangement int16) *obj.Prog {
 	p := s.Prog(v.Op.Asm())
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
index b94026b..e5165a9 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
@@ -10,6 +10,16 @@
 (AddUint16x8 ...) => (VADD8H ...)
 (AddUint32x4 ...) => (VADD4S ...)
 (AddUint64x2 ...) => (VADD2D ...)
+(Broadcast1To2Float64x2 x) => (VDUPDbcast [0] x)
+(Broadcast1To2Int64x2 x) => (VDUPDbcast [0] x)
+(Broadcast1To2Uint64x2 x) => (VDUPDbcast [0] x)
+(Broadcast1To4Float32x4 x) => (VDUPSbcast [0] x)
+(Broadcast1To4Int32x4 x) => (VDUPSbcast [0] x)
+(Broadcast1To4Uint32x4 x) => (VDUPSbcast [0] x)
+(Broadcast1To8Int16x8 x) => (VDUPHbcast [0] x)
+(Broadcast1To8Uint16x8 x) => (VDUPHbcast [0] x)
+(Broadcast1To16Int8x16 x) => (VDUPBbcast [0] x)
+(Broadcast1To16Uint8x16 x) => (VDUPBbcast [0] x)
 (GetElemFloat32x4 ...) => (VDUPSextr ...)
 (GetElemFloat64x2 ...) => (VDUPDextr ...)
 (GetElemInt8x16 ...) => (VMOVBextr ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
index 2961212..e66c68c 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
@@ -15,7 +15,11 @@
 		{name: "VMUL4S", argLength: 2, reg: v21, asm: "VMUL", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VMUL8H", argLength: 2, reg: v21, asm: "VMUL", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VMUL16B", argLength: 2, reg: v21, asm: "VMUL", commutative: true, typ: "Vec128", resultInArg0: false},
+		{name: "VDUPBbcast", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VDUPDbcast", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VDUPDextr", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VDUPHbcast", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VDUPSbcast", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VDUPSextr", argLength: 1, reg: v11, asm: "VDUP", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VMOVBextr", argLength: 1, reg: vgp, asm: "VMOV", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false},
 		{name: "VMOVBins", argLength: 2, reg: vgpv, asm: "VMOV", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 3fe2d81..08fe804 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -4507,7 +4507,11 @@
 	OpARM64VMUL4S
 	OpARM64VMUL8H
 	OpARM64VMUL16B
+	OpARM64VDUPBbcast
+	OpARM64VDUPDbcast
 	OpARM64VDUPDextr
+	OpARM64VDUPHbcast
+	OpARM64VDUPSbcast
 	OpARM64VDUPSextr
 	OpARM64VMOVBextr
 	OpARM64VMOVBins
@@ -70047,6 +70051,34 @@
 		},
 	},
 	{
+		name:    "VDUPBbcast",
+		auxType: auxUInt8,
+		argLen:  1,
+		asm:     arm64.AVDUP,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+			outputs: []outputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+		},
+	},
+	{
+		name:    "VDUPDbcast",
+		auxType: auxUInt8,
+		argLen:  1,
+		asm:     arm64.AVDUP,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+			outputs: []outputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+		},
+	},
+	{
 		name:    "VDUPDextr",
 		auxType: auxUInt8,
 		argLen:  1,
@@ -70061,6 +70093,34 @@
 		},
 	},
 	{
+		name:    "VDUPHbcast",
+		auxType: auxUInt8,
+		argLen:  1,
+		asm:     arm64.AVDUP,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+			outputs: []outputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+		},
+	},
+	{
+		name:    "VDUPSbcast",
+		auxType: auxUInt8,
+		argLen:  1,
+		asm:     arm64.AVDUP,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+			outputs: []outputInfo{
+				{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+			},
+		},
+	},
+	{
 		name:    "VDUPSextr",
 		auxType: auxUInt8,
 		argLen:  1,
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index d4e4fdb..009238c 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -603,6 +603,26 @@
 		return true
 	case OpBitRev8:
 		return rewriteValueARM64_OpBitRev8(v)
+	case OpBroadcast1To16Int8x16:
+		return rewriteValueARM64_OpBroadcast1To16Int8x16(v)
+	case OpBroadcast1To16Uint8x16:
+		return rewriteValueARM64_OpBroadcast1To16Uint8x16(v)
+	case OpBroadcast1To2Float64x2:
+		return rewriteValueARM64_OpBroadcast1To2Float64x2(v)
+	case OpBroadcast1To2Int64x2:
+		return rewriteValueARM64_OpBroadcast1To2Int64x2(v)
+	case OpBroadcast1To2Uint64x2:
+		return rewriteValueARM64_OpBroadcast1To2Uint64x2(v)
+	case OpBroadcast1To4Float32x4:
+		return rewriteValueARM64_OpBroadcast1To4Float32x4(v)
+	case OpBroadcast1To4Int32x4:
+		return rewriteValueARM64_OpBroadcast1To4Int32x4(v)
+	case OpBroadcast1To4Uint32x4:
+		return rewriteValueARM64_OpBroadcast1To4Uint32x4(v)
+	case OpBroadcast1To8Int16x8:
+		return rewriteValueARM64_OpBroadcast1To8Int16x8(v)
+	case OpBroadcast1To8Uint16x8:
+		return rewriteValueARM64_OpBroadcast1To8Uint16x8(v)
 	case OpBswap16:
 		v.Op = OpARM64REV16W
 		return true
@@ -17508,6 +17528,126 @@
 		return true
 	}
 }
+func rewriteValueARM64_OpBroadcast1To16Int8x16(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Broadcast1To16Int8x16 x)
+	// result: (VDUPBbcast [0] x)
+	for {
+		x := v_0
+		v.reset(OpARM64VDUPBbcast)
+		v.AuxInt = uint8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueARM64_OpBroadcast1To16Uint8x16(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Broadcast1To16Uint8x16 x)
+	// result: (VDUPBbcast [0] x)
+	for {
+		x := v_0
+		v.reset(OpARM64VDUPBbcast)
+		v.AuxInt = uint8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueARM64_OpBroadcast1To2Float64x2(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Broadcast1To2Float64x2 x)
+	// result: (VDUPDbcast [0] x)
+	for {
+		x := v_0
+		v.reset(OpARM64VDUPDbcast)
+		v.AuxInt = uint8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueARM64_OpBroadcast1To2Int64x2(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Broadcast1To2Int64x2 x)
+	// result: (VDUPDbcast [0] x)
+	for {
+		x := v_0
+		v.reset(OpARM64VDUPDbcast)
+		v.AuxInt = uint8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueARM64_OpBroadcast1To2Uint64x2(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Broadcast1To2Uint64x2 x)
+	// result: (VDUPDbcast [0] x)
+	for {
+		x := v_0
+		v.reset(OpARM64VDUPDbcast)
+		v.AuxInt = uint8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueARM64_OpBroadcast1To4Float32x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Broadcast1To4Float32x4 x)
+	// result: (VDUPSbcast [0] x)
+	for {
+		x := v_0
+		v.reset(OpARM64VDUPSbcast)
+		v.AuxInt = uint8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueARM64_OpBroadcast1To4Int32x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Broadcast1To4Int32x4 x)
+	// result: (VDUPSbcast [0] x)
+	for {
+		x := v_0
+		v.reset(OpARM64VDUPSbcast)
+		v.AuxInt = uint8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueARM64_OpBroadcast1To4Uint32x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Broadcast1To4Uint32x4 x)
+	// result: (VDUPSbcast [0] x)
+	for {
+		x := v_0
+		v.reset(OpARM64VDUPSbcast)
+		v.AuxInt = uint8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueARM64_OpBroadcast1To8Int16x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Broadcast1To8Int16x8 x)
+	// result: (VDUPHbcast [0] x)
+	for {
+		x := v_0
+		v.reset(OpARM64VDUPHbcast)
+		v.AuxInt = uint8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueARM64_OpBroadcast1To8Uint16x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Broadcast1To8Uint16x8 x)
+	// result: (VDUPHbcast [0] x)
+	for {
+		x := v_0
+		v.reset(OpARM64VDUPHbcast)
+		v.AuxInt = uint8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
 func rewriteValueARM64_OpCondSelect(v *Value) bool {
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
diff --git a/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go b/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
index 1467012..d21c176 100644
--- a/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
@@ -20,6 +20,16 @@
 	addF(simdPackage, "Uint16x8.Add", opLen2(ssa.OpAddUint16x8, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Uint32x4.Add", opLen2(ssa.OpAddUint32x4, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Uint64x2.Add", opLen2(ssa.OpAddUint64x2, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Float64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Float64x2, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Int64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Int64x2, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Uint64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Uint64x2, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Float32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float32x4, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Int32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int32x4, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Uint32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint32x4, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Int16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int16x8, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Uint16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint16x8, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Int8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int8x16, types.TypeVec128), sys.ARM64)
+	addF(simdPackage, "Uint8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint8x16, types.TypeVec128), sys.ARM64)
 	addF(simdPackage, "Float32x4.GetElem", opLen1Imm(ssa.OpGetElemFloat32x4, types.Types[types.TFLOAT32], 0, 3), sys.ARM64)
 	addF(simdPackage, "Float64x2.GetElem", opLen1Imm(ssa.OpGetElemFloat64x2, types.Types[types.TFLOAT64], 0, 1), sys.ARM64)
 	addF(simdPackage, "Int8x16.GetElem", opLen1Imm(ssa.OpGetElemInt8x16, types.Types[types.TINT8], 0, 15), sys.ARM64)
diff --git a/src/simd/archsimd/_gen/simdgen/arch.go b/src/simd/archsimd/_gen/simdgen/arch.go
index 73fd149..794d522 100644
--- a/src/simd/archsimd/_gen/simdgen/arch.go
+++ b/src/simd/archsimd/_gen/simdgen/arch.go
@@ -62,6 +62,7 @@
 
 var arm64RegInfoKeys = []string{
 	"v11",
+	"v11ImmIn1",
 	"v11ScalarImmIn1",
 	"v21",
 	"vgpImmIn1",
@@ -77,11 +78,12 @@
 }
 
 var arm64RegInfoSet = map[string]bool{
-	"v11":  true,
-	"v21":  true,
-	"vgp":  true,
-	"vgpv": true,
-	"vfpv": true,
+	"v11":       true,
+	"v21":       true,
+	"vgp":       true,
+	"vgpv":      true,
+	"vfpv":      true,
+	"v11ImmIn1": true,
 }
 
 // arm64Arrangements contains the SIMD arrangement suffixes for ARM64 NEON.
diff --git a/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/categories.yaml b/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/categories.yaml
index 0395e69..6187bd5 100644
--- a/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/categories.yaml
@@ -9,3 +9,31 @@
   commutative: false
   documentation: !string |-
     // NAME sets a single constant-indexed element's value.
+
+- go: Broadcast1To16
+  constImm: 0
+  commutative: false
+  documentation: !string |-
+    // NAME copies the lowest element of its input to all 16 elements of
+    // the output vector.
+
+- go: Broadcast1To8
+  constImm: 0
+  commutative: false
+  documentation: !string |-
+    // NAME copies the lowest element of its input to all 8 elements of
+    // the output vector.
+
+- go: Broadcast1To4
+  constImm: 0
+  commutative: false
+  documentation: !string |-
+    // NAME copies the lowest element of its input to all 4 elements of
+    // the output vector.
+
+- go: Broadcast1To2
+  constImm: 0
+  commutative: false
+  documentation: !string |-
+    // NAME copies the lowest element of its input to all 2 elements of
+    // the output vector.
diff --git a/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/go.yaml
index 29dd876..8394406 100644
--- a/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/go.yaml
@@ -70,3 +70,54 @@
   out:
   - *vector
 
+# Broadcast1To16 VDUP (duplicate element 0 to all 16 byte lanes)
+- go: Broadcast1To16
+  asm: "VDUP"
+  SSAVariant: "bcast"
+  in:
+  - *imm0
+  - &bcast8
+    go: $t
+    bits: 128
+    elemBits: 8
+  out:
+  - *bcast8
+
+# Broadcast1To8 VDUP (duplicate element 0 to all 8 halfword lanes)
+- go: Broadcast1To8
+  asm: "VDUP"
+  SSAVariant: "bcast"
+  in:
+  - *imm0
+  - &bcast16
+    go: $t
+    bits: 128
+    elemBits: 16
+  out:
+  - *bcast16
+
+# Broadcast1To4 VDUP (duplicate element 0 to all 4 word lanes)
+- go: Broadcast1To4
+  asm: "VDUP"
+  SSAVariant: "bcast"
+  in:
+  - *imm0
+  - &bcast32
+    go: $t
+    bits: 128
+    elemBits: 32
+  out:
+  - *bcast32
+
+# Broadcast1To2 VDUP (duplicate element 0 to both doubleword lanes)
+- go: Broadcast1To2
+  asm: "VDUP"
+  SSAVariant: "bcast"
+  in:
+  - *imm0
+  - &bcast64
+    go: $t
+    bits: 128
+    elemBits: 64
+  out:
+  - *bcast64
diff --git a/src/simd/archsimd/_gen/simdgen/gen_utility.go b/src/simd/archsimd/_gen/simdgen/gen_utility.go
index 2717916..97aa632 100644
--- a/src/simd/archsimd/_gen/simdgen/gen_utility.go
+++ b/src/simd/archsimd/_gen/simdgen/gen_utility.go
@@ -732,6 +732,9 @@
 
 		if immType == ConstImm || immType == ConstVarImm {
 			op.In[0].Const = op.ConstImm
+			// If the category declares a constImm, the immediate is fully constant;
+			// clear ImmOffset to ensure this is treated as ConstImm (no aux field).
+			op.In[0].ImmOffset = nil
 		}
 		// Otherwise, just not port it - e.g. {VPCMP[BWDQ] imm=0} and {VPCMPEQ[BWDQ]} are
 		// the same operations "Equal", [dedupgodef] should be able to distinguish them.
diff --git a/src/simd/archsimd/_gen/tmplgen/main.go b/src/simd/archsimd/_gen/tmplgen/main.go
index 34c29fe..12cd855 100644
--- a/src/simd/archsimd/_gen/tmplgen/main.go
+++ b/src/simd/archsimd/_gen/tmplgen/main.go
@@ -899,6 +899,24 @@
 }
 `)
 
+var broadcastTemplateArm64 = shapedTemplateOf(arm64Shapes, "arm64_broadcast", `
+// Broadcast{{.VType}} returns a vector with the input
+// x assigned to all elements of the output.
+func Broadcast{{.VType}}(x {{.Etype}}) {{.VType}} {
+	var z {{.VType}}
+	return z.SetElem(0, x).Broadcast1To{{.Count}}()
+}
+`)
+
+var stringTemplateArm64 = shapedTemplateOf(arm64Shapes, "arm64_String methods", `
+// String returns a string representation of SIMD vector x.
+func (x {{.VType}}) String() string {
+	var s [{{.Count}}]{{.Etype}}
+	x.Store(&s)
+	return sliceToString(s[:])
+}
+`)
+
 var maskCvtTemplate = shapedTemplateOf(intShapes, "Mask conversions", `
 // ToMask converts from {{.Base}}{{.WxC}} to Mask{{.WxC}}, mask element is set to true when the corresponding vector element is non-zero.
 func (from {{.Base}}{{.WxC}}) ToMask() (to Mask{{.WxC}}) {
@@ -933,6 +951,7 @@
 	// ARM64-specific
 	bhArm64 := flag.String("bhArm64", TD+"arm64_binary_helpers_test.go", "file name for ARM64 binary test helpers")
 	slArm64 := flag.String("slArm64", SIMD+"slice_gen_arm64.go", "file name for ARM64 slice operations")
+	opArm64 := flag.String("opArm64", SIMD+"other_gen_arm64.go", "file name for ARM64 other operations")
 	flag.Parse()
 
 	if *sl != "" {
@@ -1000,6 +1019,9 @@
 	if *bhArm64 != "" {
 		one(*bhArm64, curryTestPrologue("binary simd methods", "arm64"), binaryTemplateArm64)
 	}
+	if *opArm64 != "" {
+		one(*opArm64, prologue, broadcastTemplateArm64, stringTemplateArm64)
+	}
 
 	nonTemplateRewrites(SSA+"tern_helpers.go", ssaPrologue, classifyBooleanSIMD, ternOpForLogical)
 
diff --git a/src/simd/archsimd/internal/simd_test/arm64_simd_test.go b/src/simd/archsimd/internal/simd_test/arm64_simd_test.go
new file mode 100644
index 0000000..d55aa0f
--- /dev/null
+++ b/src/simd/archsimd/internal/simd_test/arm64_simd_test.go
@@ -0,0 +1,103 @@
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && arm64
+
+package simd_test
+
+import (
+	"simd/archsimd"
+	"testing"
+)
+
+func TestBroadcastUint32x4(t *testing.T) {
+	s := make([]uint32, 4, 4)
+	archsimd.BroadcastUint32x4(123456789).StoreSlice(s)
+	checkSlices(t, s, []uint32{123456789, 123456789, 123456789, 123456789})
+}
+
+func TestBroadcastFloat32x4(t *testing.T) {
+	s := make([]float32, 4, 4)
+	archsimd.BroadcastFloat32x4(3.14).StoreSlice(s)
+	checkSlices(t, s, []float32{3.14, 3.14, 3.14, 3.14})
+}
+
+func TestBroadcastFloat64x2(t *testing.T) {
+	s := make([]float64, 2, 2)
+	archsimd.BroadcastFloat64x2(3.14).StoreSlice(s)
+	checkSlices(t, s, []float64{3.14, 3.14})
+}
+
+func TestBroadcastUint64x2(t *testing.T) {
+	s := make([]uint64, 2, 2)
+	archsimd.BroadcastUint64x2(123456789012345).StoreSlice(s)
+	checkSlices(t, s, []uint64{123456789012345, 123456789012345})
+}
+
+func TestString(t *testing.T) {
+	x := archsimd.LoadUint32x4Slice([]uint32{0, 1, 2, 3})
+	y := archsimd.LoadInt64x2Slice([]int64{-44, -5})
+	z := archsimd.LoadFloat32x4Slice([]float32{0.5, 1.5, -2.5, 3.5e9})
+	w := archsimd.LoadFloat64x2Slice([]float64{-2.5, 3.5e9})
+
+	sx := "{0,1,2,3}"
+	sy := "{-44,-5}"
+	sz := "{0.5,1.5,-2.5,3.5e+09}"
+	sw := "{-2.5,3.5e+09}"
+
+	if x.String() != sx {
+		t.Errorf("x=%s wanted %s", x, sx)
+	}
+	if y.String() != sy {
+		t.Errorf("y=%s wanted %s", y, sy)
+	}
+	if z.String() != sz {
+		t.Errorf("z=%s wanted %s", z, sz)
+	}
+	if w.String() != sw {
+		t.Errorf("w=%s wanted %s", w, sw)
+	}
+	t.Logf("w=%s", w)
+	t.Logf("x=%s", x)
+	t.Logf("y=%s", y)
+	t.Logf("z=%s", z)
+}
+
+func TestBroadcastUint16x8(t *testing.T) {
+	s := make([]uint16, 8, 8)
+	archsimd.BroadcastUint16x8(12345).StoreSlice(s)
+	checkSlices(t, s, []uint16{12345, 12345, 12345, 12345, 12345, 12345, 12345, 12345})
+}
+
+func TestBroadcastInt8x16(t *testing.T) {
+	s := make([]int8, 16, 16)
+	archsimd.BroadcastInt8x16(-123).StoreSlice(s)
+	checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123,
+		-123, -123, -123, -123, -123, -123, -123, -123})
+}
+
+func TestBroadcastUint8x16(t *testing.T) {
+	s := make([]uint8, 16, 16)
+	archsimd.BroadcastUint8x16(200).StoreSlice(s)
+	checkSlices(t, s, []uint8{200, 200, 200, 200, 200, 200, 200, 200,
+		200, 200, 200, 200, 200, 200, 200, 200})
+}
+
+func TestBroadcastInt16x8(t *testing.T) {
+	s := make([]int16, 8, 8)
+	archsimd.BroadcastInt16x8(-12345).StoreSlice(s)
+	checkSlices(t, s, []int16{-12345, -12345, -12345, -12345, -12345, -12345, -12345, -12345})
+}
+
+func TestBroadcastInt32x4(t *testing.T) {
+	s := make([]int32, 4, 4)
+	archsimd.BroadcastInt32x4(-123456789).StoreSlice(s)
+	checkSlices(t, s, []int32{-123456789, -123456789, -123456789, -123456789})
+}
+
+func TestBroadcastInt64x2(t *testing.T) {
+	s := make([]int64, 2, 2)
+	archsimd.BroadcastInt64x2(-123456789).StoreSlice(s)
+	checkSlices(t, s, []int64{-123456789, -123456789})
+}
diff --git a/src/simd/archsimd/ops_arm64.go b/src/simd/archsimd/ops_arm64.go
index 66c3d36..57af31b 100644
--- a/src/simd/archsimd/ops_arm64.go
+++ b/src/simd/archsimd/ops_arm64.go
@@ -55,6 +55,74 @@
 // Asm: VADD, CPU Feature: NEON
 func (x Uint64x2) Add(y Uint64x2) Uint64x2
 
+/* Broadcast1To2 */
+
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Float64x2) Broadcast1To2() Float64x2
+
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Int64x2) Broadcast1To2() Int64x2
+
+// Broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Uint64x2) Broadcast1To2() Uint64x2
+
+/* Broadcast1To4 */
+
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Float32x4) Broadcast1To4() Float32x4
+
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Int32x4) Broadcast1To4() Int32x4
+
+// Broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Uint32x4) Broadcast1To4() Uint32x4
+
+/* Broadcast1To8 */
+
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Int16x8) Broadcast1To8() Int16x8
+
+// Broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Uint16x8) Broadcast1To8() Uint16x8
+
+/* Broadcast1To16 */
+
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Int8x16) Broadcast1To16() Int8x16
+
+// Broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VDUP, CPU Feature: NEON
+func (x Uint8x16) Broadcast1To16() Uint8x16
+
 /* GetElem */
 
 // GetElem retrieves a single constant-indexed element's value.
diff --git a/src/simd/archsimd/other_gen_arm64.go b/src/simd/archsimd/other_gen_arm64.go
new file mode 100644
index 0000000..daea458
--- /dev/null
+++ b/src/simd/archsimd/other_gen_arm64.go
@@ -0,0 +1,145 @@
+// Code generated by 'tmplgen'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+// BroadcastInt8x16 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastInt8x16(x int8) Int8x16 {
+	var z Int8x16
+	return z.SetElem(0, x).Broadcast1To16()
+}
+
+// BroadcastInt16x8 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastInt16x8(x int16) Int16x8 {
+	var z Int16x8
+	return z.SetElem(0, x).Broadcast1To8()
+}
+
+// BroadcastInt32x4 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastInt32x4(x int32) Int32x4 {
+	var z Int32x4
+	return z.SetElem(0, x).Broadcast1To4()
+}
+
+// BroadcastInt64x2 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastInt64x2(x int64) Int64x2 {
+	var z Int64x2
+	return z.SetElem(0, x).Broadcast1To2()
+}
+
+// BroadcastUint8x16 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastUint8x16(x uint8) Uint8x16 {
+	var z Uint8x16
+	return z.SetElem(0, x).Broadcast1To16()
+}
+
+// BroadcastUint16x8 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastUint16x8(x uint16) Uint16x8 {
+	var z Uint16x8
+	return z.SetElem(0, x).Broadcast1To8()
+}
+
+// BroadcastUint32x4 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastUint32x4(x uint32) Uint32x4 {
+	var z Uint32x4
+	return z.SetElem(0, x).Broadcast1To4()
+}
+
+// BroadcastUint64x2 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastUint64x2(x uint64) Uint64x2 {
+	var z Uint64x2
+	return z.SetElem(0, x).Broadcast1To2()
+}
+
+// BroadcastFloat32x4 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastFloat32x4(x float32) Float32x4 {
+	var z Float32x4
+	return z.SetElem(0, x).Broadcast1To4()
+}
+
+// BroadcastFloat64x2 returns a vector with the input
+// x assigned to all elements of the output.
+func BroadcastFloat64x2(x float64) Float64x2 {
+	var z Float64x2
+	return z.SetElem(0, x).Broadcast1To2()
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Int8x16) String() string {
+	var s [16]int8
+	x.Store(&s)
+	return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Int16x8) String() string {
+	var s [8]int16
+	x.Store(&s)
+	return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Int32x4) String() string {
+	var s [4]int32
+	x.Store(&s)
+	return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Int64x2) String() string {
+	var s [2]int64
+	x.Store(&s)
+	return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Uint8x16) String() string {
+	var s [16]uint8
+	x.Store(&s)
+	return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Uint16x8) String() string {
+	var s [8]uint16
+	x.Store(&s)
+	return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Uint32x4) String() string {
+	var s [4]uint32
+	x.Store(&s)
+	return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Uint64x2) String() string {
+	var s [2]uint64
+	x.Store(&s)
+	return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Float32x4) String() string {
+	var s [4]float32
+	x.Store(&s)
+	return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x.
+func (x Float64x2) String() string {
+	var s [2]float64
+	x.Store(&s)
+	return sliceToString(s[:])
+}
diff --git a/src/simd/archsimd/string.go b/src/simd/archsimd/string.go
index 77500ad..7dc766b 100644
--- a/src/simd/archsimd/string.go
+++ b/src/simd/archsimd/string.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build goexperiment.simd && amd64
+//go:build goexperiment.simd
 
 package archsimd

Change information

Files:

M src/cmd/compile/internal/arm64/simdssa.go
M src/cmd/compile/internal/arm64/ssa.go
M src/cmd/compile/internal/ssa/_gen/simdARM64.rules
M src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
M src/cmd/compile/internal/ssa/opGen.go
M src/cmd/compile/internal/ssa/rewriteARM64.go
M src/cmd/compile/internal/ssagen/simdARM64intrinsics.go
M src/simd/archsimd/_gen/simdgen/arch.go
M src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/categories.yaml
M src/simd/archsimd/_gen/simdgen/arm64/ops/Moves/go.yaml
M src/simd/archsimd/_gen/simdgen/gen_utility.go
M src/simd/archsimd/_gen/tmplgen/main.go
A src/simd/archsimd/internal/simd_test/arm64_simd_test.go
M src/simd/archsimd/ops_arm64.go
A src/simd/archsimd/other_gen_arm64.go
M src/simd/archsimd/string.go

Change size: L

Delta: 16 files changed, 693 insertions(+), 6 deletions(-)

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

David Chase (Gerrit)

unread,

Apr 27, 2026, 4:04:24 PM (2 days ago) Apr 27

to Alexander Musman, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Alexander Musman

David Chase added 2 comments

Patchset-level comments

File-level comment, Patchset 1 (Latest):

David Chase . resolved

I am cherry-picking my way up the stack, and commenting as I go.

File src/simd/archsimd/_gen/simdgen/gen_utility.go

Line 737, Patchset 1 (Latest): op.In[0].ImmOffset = nil

David Chase . unresolved

This change breaks generation for AMD64 -- all sorts of operations lose their immediate operands. I tried reversing it and that led to different breakage generating files for ARM64.

After much "experimentation" the "fix" that I arrived at was to choose a hacky name for the immediate operands for arm64 Broadcast operations (I used "@") and, if the immediate operand had that name, then set the offset to nil. This will fail to unify with the names assigned in arm64/emit.go at line 68; however, those names are not good -- they don't allow any other name to be specified in the yaml file, whether an informative one like "lane" or "precision" or a magic hacky one like "@".

Open in Gerrit

Related details

Attention is currently required from:

Alexander Musman

Submit Requirements:

Code-Review

No-Unresolved-Comments

Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Alexander Musman (Gerrit)

unread,

2:29 AM (3 hours ago) 2:29 AM

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Alexander Musman

Alexander Musman uploaded new patchset

Alexander Musman uploaded patch set #2 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Alexander Musman

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Reply all

Reply to author

Forward

0 new messages