[dev.simd] simd: arm64 neon toy example
diff --git a/src/cmd/compile/internal/arm64/simdssa.go b/src/cmd/compile/internal/arm64/simdssa.go
new file mode 100644
index 0000000..a948719
--- /dev/null
+++ b/src/cmd/compile/internal/arm64/simdssa.go
@@ -0,0 +1,33 @@
+// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+
+package arm64
+
+import (
+ "cmd/compile/internal/ssa"
+ "cmd/compile/internal/ssagen"
+ "cmd/internal/obj"
+ "cmd/internal/obj/arm64"
+)
+
+func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
+ var p *obj.Prog
+ switch v.Op {
+ case ssa.OpARM64VFADD32x4,
+ ssa.OpARM64VADD32x4:
+ p = simdV21(s, v, arng_32x4)
+
+ case ssa.OpARM64VFADD64x2,
+ ssa.OpARM64VADD64x2:
+ p = simdV21(s, v, arng_64x2)
+
+ default:
+ // Unknown reg shape
+ return false
+ }
+
+ // Ensure p and architecture package are marked as used
+ // (they may not be used in all generated code paths)
+ _ = p
+ _ = arm64.REG_V0
+ return true
+}
diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index 534954f..4d753b6 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -21,7 +21,11 @@
// loadByType returns the load instruction of the given type.
func loadByType(t *types.Type) obj.As {
- if t.IsFloat() {
+ if t.IsSIMD() {
+ if t.Size() == 16 {
+ return arm64.AFMOVQ // Use FMOVQ (LDR Q) for 128-bit SIMD loads
+ }
+ } else if t.IsFloat() {
switch t.Size() {
case 4:
return arm64.AFMOVS
@@ -57,7 +61,11 @@
// storeByType returns the store instruction of the given type.
func storeByType(t *types.Type) obj.As {
- if t.IsFloat() {
+ if t.IsSIMD() {
+ if t.Size() == 16 {
+ return arm64.AFMOVQ // Use FMOVQ (STR Q) for 128-bit SIMD stores
+ }
+ } else if t.IsFloat() {
switch t.Size() {
case 4:
return arm64.AFMOVS
@@ -1472,7 +1480,9 @@
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
default:
- v.Fatalf("genValue not implemented: %s", v.LongString())
+ if !ssaGenSIMDValue(s, v) {
+ v.Fatalf("genValue not implemented: %s", v.LongString())
+ }
}
}
@@ -1729,3 +1739,56 @@
st.To.Reg = dst
st.To.Offset = off
}
+
+// Arrangement constants for ARM64 SIMD operations (matches cmd/internal/obj/arm64/a.out.go).
+// Currently we expect only full-width arrangements (.16B, .8H, .4S and .2D) from generator.
+const (
+	arng_8x8  = iota // ARNG_8B: 8 lanes of 8-bit elements
+	arng_8x16        // ARNG_16B: 16 lanes of 8-bit elements
+	arng_64x1        // ARNG_1D: 1 lane of 64-bit elements
+	arng_16x4        // ARNG_4H: 4 lanes of 16-bit elements
+	arng_16x8        // ARNG_8H: 8 lanes of 16-bit elements
+	arng_32x2        // ARNG_2S: 2 lanes of 32-bit elements (was a duplicate arng_16x4)
+	arng_32x4        // ARNG_4S: 4 lanes of 32-bit elements
+	arng_64x2        // ARNG_2D: 2 lanes of 64-bit elements
+)
+
+// simdV21 generates the three-register SIMD instruction for v (two vector inputs, one vector output),
+// for example: VADD V1.4S, V0.4S, V0.4S. Operand placement: From = v.Args[1], Reg = v.Args[0], To = v.
+// The arrangement parameter specifies the vector element arrangement (e.g., 4S, 2D)
+func simdV21(s *ssagen.State, v *ssa.Value, arrangement int16) *obj.Prog {
+ p := s.Prog(v.Op.Asm())
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg, p.From.Class = simdReg(v.Args[1], arrangement)
+ p.Reg, _ = simdReg(v.Args[0], arrangement) // only the encoded register is kept; the returned class is discarded
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg, p.To.Class = simdReg(v, arrangement)
+ return p
+}
+
+// simdReg converts the register assigned to SSA value v into the ARM64 vector
+// register operand with the specified arrangement (e.g., V0.4S for 32x4, V0.2D for 64x2).
+// It returns the encoded register and the corresponding addressing class (C_ARNG).
+// The encoding needs to be consistent with ARM64RegisterExtension.
+func simdReg(v *ssa.Value, arrangement int16) (int16, int8) {
+	// Reduce the assigned register to its number (0-31).
+	reg := v.Reg()
+	var regNum int16
+	switch {
+	case arm64.REG_F0 <= reg && reg <= arm64.REG_F31:
+		// SSA uses F registers for SIMD - convert to V register number
+		regNum = reg - arm64.REG_F0
+	case arm64.REG_V0 <= reg && reg <= arm64.REG_V31:
+		// Currently SSA uses F registers for SIMD values; report through base.Fatalf
+		// (as the default case does) if this assumption ever changes.
+		base.Fatalf("simdReg: got V register from SSA - this path needs investigation")
+	default:
+		// Unexpected register type
+		base.Fatalf("simdReg: unexpected register %v for SIMD value", reg)
+	}
+
+	// Return V register with arrangement for SIMD arithmetic operations
+	// REG_ARNG is the base for registers with arrangement
+	// The arrangement goes in bits [8:5], register number in bits [4:0]
+	return arm64.REG_ARNG | (arrangement << 5) | regNum, arm64.C_ARNG
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64.rules b/src/cmd/compile/internal/ssa/_gen/ARM64.rules
index f54a692..c184422 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64.rules
@@ -341,6 +341,7 @@
(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem)
(Load <t> ptr mem) && is32BitFloat(t) => (FMOVSload ptr mem)
(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem)
+(Load <t> ptr mem) && t.Size() == 16 => (FMOVQload ptr mem)
// stores
(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
@@ -349,6 +350,7 @@
(Store {t} ptr val mem) && t.Size() == 8 && !t.IsFloat() => (MOVDstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 4 && t.IsFloat() => (FMOVSstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 8 && t.IsFloat() => (FMOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 16 => (FMOVQstore ptr val mem)
// zeroing
(Zero [0] _ mem) => mem
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
index 6b1ae48..10e2040 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
@@ -802,7 +802,8 @@
name: "ARM64",
pkg: "cmd/internal/obj/arm64",
genfile: "../../arm64/ssa.go",
- ops: ops,
+ genSIMDfile: "../../arm64/simdssa.go",
+ ops: append(ops, simdARM64Ops(fp11, fp21)...),
blocks: blocks,
regnames: regNamesARM64,
ParamIntRegNames: "R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15",
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64.rules b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
new file mode 100644
index 0000000..1f0ecff
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64.rules
@@ -0,0 +1,6 @@
+// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+
+(AddFloat32x4 ...) => (VFADD32x4 ...)
+(AddFloat64x2 ...) => (VFADD64x2 ...)
+(AddInt32x4 ...) => (VADD32x4 ...)
+(AddInt64x2 ...) => (VADD64x2 ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
new file mode 100644
index 0000000..1d560b1
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/simdARM64ops.go
@@ -0,0 +1,12 @@
+// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+
+package main
+
+func simdARM64Ops(v11, v21 regInfo) []opData {
+ return []opData{
+ {name: "VADD32x4", argLength: 2, reg: v21, asm: "VADD", commutative: true, typ: "Vec128", resultInArg0: false},
+ {name: "VADD64x2", argLength: 2, reg: v21, asm: "VADD", commutative: true, typ: "Vec128", resultInArg0: false},
+ {name: "VFADD32x4", argLength: 2, reg: v21, asm: "VFADD", commutative: true, typ: "Vec128", resultInArg0: false},
+ {name: "VFADD64x2", argLength: 2, reg: v21, asm: "VFADD", commutative: true, typ: "Vec128", resultInArg0: false},
+ }
+}
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 0593470..ae5534d 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -3109,6 +3109,10 @@
OpARM64PRFM
OpARM64DMB
OpARM64ZERO
+ OpARM64VADD32x4
+ OpARM64VADD64x2
+ OpARM64VFADD32x4
+ OpARM64VFADD64x2
OpLOONG64NEGV
OpLOONG64NEGF
@@ -46171,6 +46175,66 @@
fixedReg: true,
reg: regInfo{},
},
+ {
+ name: "VADD32x4",
+ argLen: 2,
+ commutative: true,
+ asm: arm64.AVADD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
+ name: "VADD64x2",
+ argLen: 2,
+ commutative: true,
+ asm: arm64.AVADD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
+ name: "VFADD32x4",
+ argLen: 2,
+ commutative: true,
+ asm: arm64.AVFADD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
+ name: "VFADD64x2",
+ argLen: 2,
+ commutative: true,
+ asm: arm64.AVFADD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
{
name: "NEGV",
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index 6af1558..12da362 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -421,6 +421,18 @@
case OpAdd8:
v.Op = OpARM64ADD
return true
+ case OpAddFloat32x4:
+ v.Op = OpARM64VFADD32x4
+ return true
+ case OpAddFloat64x2:
+ v.Op = OpARM64VFADD64x2
+ return true
+ case OpAddInt32x4:
+ v.Op = OpARM64VADD32x4
+ return true
+ case OpAddInt64x2:
+ v.Op = OpARM64VADD64x2
+ return true
case OpAddPtr:
v.Op = OpARM64ADD
return true
@@ -18259,6 +18271,20 @@
v.AddArg2(ptr, mem)
return true
}
+ // match: (Load <t> ptr mem)
+ // cond: t.Size() == 16
+ // result: (FMOVQload ptr mem)
+ for {
+ t := v.Type
+ ptr := v_0
+ mem := v_1
+ if !(t.Size() == 16) {
+ break
+ }
+ v.reset(OpARM64FMOVQload)
+ v.AddArg2(ptr, mem)
+ return true
+ }
return false
}
func rewriteValueARM64_OpLocalAddr(v *Value) bool {
@@ -21994,6 +22020,21 @@
v.AddArg3(ptr, val, mem)
return true
}
+ // match: (Store {t} ptr val mem)
+ // cond: t.Size() == 16
+ // result: (FMOVQstore ptr val mem)
+ for {
+ t := auxToType(v.Aux)
+ ptr := v_0
+ val := v_1
+ mem := v_2
+ if !(t.Size() == 16) {
+ break
+ }
+ v.reset(OpARM64FMOVQstore)
+ v.AddArg3(ptr, val, mem)
+ return true
+ }
return false
}
func rewriteValueARM64_OpZero(v *Value) bool {
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index 4ce329e..8a0647d 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -1695,4 +1695,29 @@
addF(simdPackage, "Mask64x8.StoreToBits", simdStoreMask(64, 8), sys.AMD64)
addF(simdPackage, "Mask64x8FromBits", simdCvtVToMask(64, 8), sys.AMD64)
addF(simdPackage, "Mask64x8.ToBits", simdCvtMaskToV(64, 8), sys.AMD64)
+ // NEON
+ addF(simdPackage, "Float32x4.Add", opLen2(ssa.OpAddFloat32x4, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Float64x2.Add", opLen2(ssa.OpAddFloat64x2, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Int32x4.Add", opLen2(ssa.OpAddInt32x4, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Int64x2.Add", opLen2(ssa.OpAddInt64x2, types.TypeVec128), sys.ARM64)
+ addF(simdPackage, "Float32x4.AsFloat64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "Float32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "Float32x4.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "Float64x2.AsFloat32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "Float64x2.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "Float64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "Int32x4.AsFloat32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "Int32x4.AsFloat64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "Int32x4.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "Int64x2.AsFloat32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "Int64x2.AsFloat64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "Int64x2.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.ARM64)
+ addF(simdPackage, "LoadFloat32x4", simdLoad(), sys.ARM64)
+ addF(simdPackage, "Float32x4.Store", simdStore(), sys.ARM64)
+ addF(simdPackage, "LoadFloat64x2", simdLoad(), sys.ARM64)
+ addF(simdPackage, "Float64x2.Store", simdStore(), sys.ARM64)
+ addF(simdPackage, "LoadInt32x4", simdLoad(), sys.ARM64)
+ addF(simdPackage, "Int32x4.Store", simdStore(), sys.ARM64)
+ addF(simdPackage, "LoadInt64x2", simdLoad(), sys.ARM64)
+ addF(simdPackage, "Int64x2.Store", simdStore(), sys.ARM64)
}
diff --git a/src/simd/_gen/simdgen/arm64.neon.yaml.toy b/src/simd/_gen/simdgen/arm64.neon.yaml.toy
new file mode 100644
index 0000000..901fcd5
--- /dev/null
+++ b/src/simd/_gen/simdgen/arm64.neon.yaml.toy
@@ -0,0 +1,146 @@
+!sum
+
+# How to generate simd support for an ARM64 NEON subset specified in this file:
+# 1) Run the generator tool with "-arch arm64" option:
+# src/simd/_gen/simdgen% go run . -arch arm64 -o godefs -goroot output arm64.neon.yaml.toy types.yaml
+# 2) Manually merge the files shared between amd64 and arm64:
+# src/simd/_gen/simdgen% export C=../../../ ; export O=output/src/
+# export F=cmd/compile/internal/ssagen/simdintrinsics.go ; vimdiff $O/$F $C/$F
+# export F=cmd/compile/internal/ssa/_gen/simdgenericOps.go ; vimdiff $O/$F $C/$F
+# export F=simd/cpu.go ; vimdiff $O/$F $C/$F
+# 3) The remaining generated files are arm64-specific and may be copied over:
+# export F=cmd/compile/internal/ssa/_gen/simdARM64ops.go ; cp $O/$F $C/$F
+# export F=cmd/compile/internal/ssa/_gen/simdARM64.rules ; cp $O/$F $C/$F
+# export F=simd/types_arm64.go ; cp $O/$F $C/$F
+# export F=simd/ops_arm64.go ; cp $O/$F $C/$F
+# 4) Ready to build and test:
+# cd WSROOT/src/cmd/compile/internal/ssa && go generate
+# cd WSROOT/src && ./make.bash
+
+# ARM64 FADD instruction (NEON 4-lane 32-bit floating-point add)
+- go: Add
+ goarch: arm64
+ asm: VFADD
+ arrangement: "32x4"
+ cpuFeature: NEON
+ commutative: true
+ inVariant: []
+ in:
+ - class: vreg
+ go: Float32x4
+ base: float
+ elemBits: 32
+ bits: 128
+ lanes: 4
+ asmPos: 1
+ - class: vreg
+ go: Float32x4
+ base: float
+ elemBits: 32
+ bits: 128
+ lanes: 4
+ asmPos: 2
+ out:
+ - class: vreg
+ go: Float32x4
+ base: float
+ elemBits: 32
+ bits: 128
+ lanes: 4
+ asmPos: 0
+
+# ARM64 FADD instruction (NEON 2-lane 64-bit floating-point add)
+- go: Add
+ goarch: arm64
+ asm: VFADD
+ arrangement: "64x2"
+ cpuFeature: NEON
+ commutative: true
+ inVariant: []
+ in:
+ - class: vreg
+ go: Float64x2
+ base: float
+ elemBits: 64
+ bits: 128
+ lanes: 2
+ asmPos: 1
+ - class: vreg
+ go: Float64x2
+ base: float
+ elemBits: 64
+ bits: 128
+ lanes: 2
+ asmPos: 2
+ out:
+ - class: vreg
+ go: Float64x2
+ base: float
+ elemBits: 64
+ bits: 128
+ lanes: 2
+ asmPos: 0
+
+# ARM64 ADD instruction (NEON 4-lane 32-bit integer add)
+- go: Add
+ goarch: arm64
+ asm: VADD
+ arrangement: "32x4"
+ cpuFeature: NEON
+ commutative: true
+ inVariant: []
+ in:
+ - class: vreg
+ go: Int32x4
+ base: int
+ elemBits: 32
+ bits: 128
+ lanes: 4
+ asmPos: 1
+ - class: vreg
+ go: Int32x4
+ base: int
+ elemBits: 32
+ bits: 128
+ lanes: 4
+ asmPos: 2
+ out:
+ - class: vreg
+ go: Int32x4
+ base: int
+ elemBits: 32
+ bits: 128
+ lanes: 4
+ asmPos: 0
+
+# ARM64 ADD instruction (NEON 2-lane 64-bit integer add)
+- go: Add
+ goarch: arm64
+ asm: VADD
+ arrangement: "64x2"
+ cpuFeature: NEON
+ commutative: true
+ inVariant: []
+ in:
+ - class: vreg
+ go: Int64x2
+ base: int
+ elemBits: 64
+ bits: 128
+ lanes: 2
+ asmPos: 1
+ - class: vreg
+ go: Int64x2
+ base: int
+ elemBits: 64
+ bits: 128
+ lanes: 2
+ asmPos: 2
+ out:
+ - class: vreg
+ go: Int64x2
+ base: int
+ elemBits: 64
+ bits: 128
+ lanes: 2
+ asmPos: 0
diff --git a/src/simd/_gen/simdgen/gen_simdIntrinsics.go b/src/simd/_gen/simdgen/gen_simdIntrinsics.go
index 0ef5124..6b5d6aa 100644
--- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go
+++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go
@@ -141,10 +141,13 @@
}
}
- for _, typ := range typesFromTypeMap(typeMap) {
- if typ.MaskedLoadStoreFilter() {
- if err := t.ExecuteTemplate(buffer, "maskedLoadStore", typ); err != nil {
- panic(fmt.Errorf("failed to execute maskedLoadStore template: %w", err))
+ // TODO: Does NEON need masked ops here (it doesn't have native masked load/store)
+ if archInfo.Arch != "arm64" {
+ for _, typ := range typesFromTypeMap(typeMap) {
+ if typ.MaskedLoadStoreFilter() {
+ if err := t.ExecuteTemplate(buffer, "maskedLoadStore", typ); err != nil {
+ panic(fmt.Errorf("failed to execute maskedLoadStore template: %w", err))
+ }
}
}
}
diff --git a/src/simd/_gen/simdgen/gen_simdMachineOps.go b/src/simd/_gen/simdgen/gen_simdMachineOps.go
index cf952fc..ecdd19f 100644
--- a/src/simd/_gen/simdgen/gen_simdMachineOps.go
+++ b/src/simd/_gen/simdgen/gen_simdMachineOps.go
@@ -14,7 +14,7 @@
const simdMachineOpsTmpl = `
package main
-func simd{{.ArchUpper}}Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv, w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw regInfo) []opData {
+func simd{{.ArchUpper}}Ops({{if eq .ArchUpper "ARM64"}}v11, v21 regInfo{{else}}v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv, w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw regInfo{{end}}) []opData {
return []opData{
{{- range .OpsData }}
{name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: {{.Comm}}, typ: "{{.Type}}", resultInArg0: {{.ResultInArg0}}},
diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go
index 8ce348d..d43620d 100644
--- a/src/simd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/_gen/simdgen/gen_simdTypes.go
@@ -145,6 +145,9 @@
const simdFeaturesTemplate = `
import "internal/cpu"
+// Ensure internal/cpu is marked as used (package may not be used on some targets)
+var _ = cpu.DebugOptions
+
{{range .}}
{{- if eq .Feature "AVX512"}}
// Has{{.Feature}} returns whether the CPU supports the AVX512F+CD+BW+DQ+VL features.
@@ -496,7 +499,7 @@
}
// writeSIMDTypes generates the simd vector types into a bytes.Buffer
-func writeSIMDTypes(typeMap simdTypeMap) *bytes.Buffer {
+func writeSIMDTypes(typeMap simdTypeMap, archInfo ArchInfo) *bytes.Buffer {
t := templateOf(simdTypesTemplates, "types_amd64")
loadStore := templateOf(simdLoadStoreTemplate, "loadstore_amd64")
maskedLoadStore := templateOf(simdMaskedLoadStoreTemplate, "maskedloadstore_amd64")
@@ -533,7 +536,8 @@
panic(fmt.Errorf("failed to execute loadstore template for type %s: %w", typeDef.Name, err))
}
// restrict to AVX2 masked loads/stores first.
- if typeDef.MaskedLoadStoreFilter() {
+ // TODO: Does NEON need masked ops here (it doesn't have native masked load/store).
+ if archInfo.Arch != "arm64" && typeDef.MaskedLoadStoreFilter() {
if err := maskedLoadStore.ExecuteTemplate(buffer, "maskedloadstore_amd64", typeDef); err != nil {
panic(fmt.Errorf("failed to execute maskedloadstore template for type %s: %w", typeDef.Name, err))
}
diff --git a/src/simd/_gen/simdgen/gen_simdssa.go b/src/simd/_gen/simdgen/gen_simdssa.go
index 56aee04..6b0a66e 100644
--- a/src/simd/_gen/simdgen/gen_simdssa.go
+++ b/src/simd/_gen/simdgen/gen_simdssa.go
@@ -46,6 +46,10 @@
}
{{end}}
{{define "ending"}}
+ // Ensure p and architecture package are marked as used
+ // (they may not be used in all generated code paths)
+ _ = p
+ _ = {{.ObjArch}}.REG_V0
return true
}
{{end}}`))
@@ -224,8 +228,8 @@
}
}
- if err := ssaTemplates.ExecuteTemplate(buffer, "ending", nil); err != nil {
- panic(fmt.Errorf("failed to execute footer template: %w", err))
+ if err := ssaTemplates.ExecuteTemplate(buffer, "ending", archInfo); err != nil {
+ panic(fmt.Errorf("failed to execute ending template: %w", err))
}
return buffer
diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go
index c13e4cd..f3cae95 100644
--- a/src/simd/_gen/simdgen/godefs.go
+++ b/src/simd/_gen/simdgen/godefs.go
@@ -385,7 +385,7 @@
archInfo := CurrentArch()
- formatWriteAndClose(writeSIMDTypes(typeMap), path, fmt.Sprintf("src/%s/types_%s.go", simdPackage, archInfo.Arch))
+ formatWriteAndClose(writeSIMDTypes(typeMap, archInfo), path, fmt.Sprintf("src/%s/types_%s.go", simdPackage, archInfo.Arch))
formatWriteAndClose(writeSIMDFeatures(deduped), path, "src/"+simdPackage+"/cpu.go")
formatWriteAndClose(writeSIMDStubs(deduped, typeMap), path, fmt.Sprintf("src/%s/ops_%s.go", simdPackage, archInfo.Arch))
formatWriteAndClose(writeSIMDIntrinsics(deduped, typeMap), path, "src/cmd/compile/internal/ssagen/simdintrinsics.go")
diff --git a/src/simd/cpu.go b/src/simd/cpu.go
index cbde9a8..ef56b9a 100644
--- a/src/simd/cpu.go
+++ b/src/simd/cpu.go
@@ -90,3 +90,14 @@
func HasAVXVNNI() bool {
return cpu.X86.HasAVXVNNI
}
+
+// Ensure internal/cpu is marked as used (package may not be used on some targets)
+var _ = cpu.DebugOptions
+
+// HasNEON returns whether the CPU supports the NEON feature.
+//
+// HasNEON is defined on all GOARCHes and is documented to return true only on GOARCH arm64.
+// NOTE(review): the body returns true unconditionally — confirm this file is arm64-only or gate it.
+func HasNEON() bool {
+ return true // NEON is mandatory on ARM64
+}
diff --git a/src/simd/dummy_arm64.s b/src/simd/dummy_arm64.s
new file mode 100644
index 0000000..6f73a91
--- /dev/null
+++ b/src/simd/dummy_arm64.s
@@ -0,0 +1,7 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build arm64
+
+// Empty file to allow bodyless functions.
\ No newline at end of file
diff --git a/src/simd/ops_arm64.go b/src/simd/ops_arm64.go
new file mode 100644
index 0000000..4132ec6
--- /dev/null
+++ b/src/simd/ops_arm64.go
@@ -0,0 +1,63 @@
+// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package simd
+
+/* Add */
+
+// UNDOCUMENTED
+//
+// Asm: VFADD, CPU Feature: NEON
+func (x Float32x4) Add(y Float32x4) Float32x4
+
+// UNDOCUMENTED
+//
+// Asm: VFADD, CPU Feature: NEON
+func (x Float64x2) Add(y Float64x2) Float64x2
+
+// UNDOCUMENTED
+//
+// Asm: VADD, CPU Feature: NEON
+func (x Int32x4) Add(y Int32x4) Int32x4
+
+// UNDOCUMENTED
+//
+// Asm: VADD, CPU Feature: NEON
+func (x Int64x2) Add(y Int64x2) Int64x2
+
+// Float64x2 converts from Float32x4 to Float64x2
+func (from Float32x4) AsFloat64x2() (to Float64x2)
+
+// Int32x4 converts from Float32x4 to Int32x4
+func (from Float32x4) AsInt32x4() (to Int32x4)
+
+// Int64x2 converts from Float32x4 to Int64x2
+func (from Float32x4) AsInt64x2() (to Int64x2)
+
+// Float32x4 converts from Float64x2 to Float32x4
+func (from Float64x2) AsFloat32x4() (to Float32x4)
+
+// Int32x4 converts from Float64x2 to Int32x4
+func (from Float64x2) AsInt32x4() (to Int32x4)
+
+// Int64x2 converts from Float64x2 to Int64x2
+func (from Float64x2) AsInt64x2() (to Int64x2)
+
+// Float32x4 converts from Int32x4 to Float32x4
+func (from Int32x4) AsFloat32x4() (to Float32x4)
+
+// Float64x2 converts from Int32x4 to Float64x2
+func (from Int32x4) AsFloat64x2() (to Float64x2)
+
+// Int64x2 converts from Int32x4 to Int64x2
+func (from Int32x4) AsInt64x2() (to Int64x2)
+
+// Float32x4 converts from Int64x2 to Float32x4
+func (from Int64x2) AsFloat32x4() (to Float32x4)
+
+// Float64x2 converts from Int64x2 to Float64x2
+func (from Int64x2) AsFloat64x2() (to Float64x2)
+
+// Int32x4 converts from Int64x2 to Int32x4
+func (from Int64x2) AsInt32x4() (to Int32x4)
diff --git a/src/simd/testneon/sample.go b/src/simd/testneon/sample.go
new file mode 100644
index 0000000..c55c5a8
--- /dev/null
+++ b/src/simd/testneon/sample.go
@@ -0,0 +1,189 @@
+//go:build goexperiment.simd
+
+// NEON SIMD example
+// Run with: GOEXPERIMENT=simd go run sample.go
+package main
+
+import (
+ "fmt"
+ "os"
+ "simd"
+)
+
+// testFloat32x4 demonstrates addition of two Float32x4 vectors
+// and prints both operands together with their sum.
+//
+//go:noinline
+func testFloat32x4() {
+	fmt.Println("=== Float32x4 Vector Addition ===")
+
+	lhs := [4]float32{1.0, 2.0, 3.0, 4.0}
+	rhs := [4]float32{5.0, 6.0, 7.0, 8.0}
+	var sum [4]float32
+
+	// Load both arrays into vectors, add all 4 elements at once,
+	// then store the resulting vector back into an array.
+	x := simd.LoadFloat32x4(&lhs)
+	y := simd.LoadFloat32x4(&rhs)
+	total := x.Add(y)
+	total.Store(&sum)
+
+	fmt.Printf("a: %v\n", lhs)
+	fmt.Printf("b: %v\n", rhs)
+	fmt.Printf("a + b: %v\n", sum)
+}
+
+// testFloat64x2 demonstrates addition of two Float64x2 vectors
+// and prints both operands together with their sum.
+//
+//go:noinline
+func testFloat64x2() {
+	fmt.Println("\n=== Float64x2 Vector Addition ===")
+
+	lhs := [2]float64{10.5, 20.5}
+	rhs := [2]float64{2.0, 4.0}
+	var sum [2]float64
+
+	// Load both arrays into vectors, add all 2 elements at once,
+	// then store the resulting vector back into an array.
+	x := simd.LoadFloat64x2(&lhs)
+	y := simd.LoadFloat64x2(&rhs)
+	total := x.Add(y)
+	total.Store(&sum)
+
+	fmt.Printf("a: %v\n", lhs)
+	fmt.Printf("b: %v\n", rhs)
+	fmt.Printf("a + b: %v\n", sum)
+}
+
+// testInt32x4 demonstrates addition of two Int32x4 vectors
+// and prints both operands together with their sum.
+//
+//go:noinline
+func testInt32x4() {
+	fmt.Println("\n=== Int32x4 Vector Addition ===")
+
+	lhs := [4]int32{10, 20, 30, 40}
+	rhs := [4]int32{5, 6, 7, 8}
+	var sum [4]int32
+
+	// Load both arrays into vectors, add all 4 elements at once,
+	// then store the resulting vector back into an array.
+	x := simd.LoadInt32x4(&lhs)
+	y := simd.LoadInt32x4(&rhs)
+	total := x.Add(y)
+	total.Store(&sum)
+
+	fmt.Printf("a: %v\n", lhs)
+	fmt.Printf("b: %v\n", rhs)
+	fmt.Printf("a + b: %v\n", sum)
+}
+
+// testInt64x2 demonstrates addition of two Int64x2 vectors
+// and prints both operands together with their sum.
+//
+//go:noinline
+func testInt64x2() {
+	fmt.Println("\n=== Int64x2 Vector Addition ===")
+
+	lhs := [2]int64{100, 200}
+	rhs := [2]int64{50, 75}
+	var sum [2]int64
+
+	// Load both arrays into vectors, add all 2 elements at once,
+	// then store the resulting vector back into an array.
+	x := simd.LoadInt64x2(&lhs)
+	y := simd.LoadInt64x2(&rhs)
+	total := x.Add(y)
+	total.Store(&sum)
+
+	fmt.Printf("a: %v\n", lhs)
+	fmt.Printf("b: %v\n", rhs)
+	fmt.Printf("a + b: %v\n", sum)
+}
+
+func main() {
+	testFloat32x4()
+	testFloat64x2()
+	testInt32x4()
+	testInt64x2()
+
+	// Self-check: redo each addition and exit non-zero on any unexpected result.
+	fail := false
+
+	// Float32x4
+	f32A := [4]float32{1.0, 2.0, 3.0, 4.0}
+	f32B := [4]float32{5.0, 6.0, 7.0, 8.0}
+	f32Want := [4]float32{6.0, 8.0, 10.0, 12.0}
+	var f32Got [4]float32
+
+	f32X := simd.LoadFloat32x4(&f32A)
+	f32Y := simd.LoadFloat32x4(&f32B)
+	f32Sum := f32X.Add(f32Y)
+	f32Sum.Store(&f32Got)
+
+	// Arrays are comparable, so a single comparison covers every element
+	// and a mismatch is reported exactly once.
+	if f32Got != f32Want {
+		fmt.Printf("Float32x4 test failed: expected %v, got %v\n", f32Want, f32Got)
+		fail = true
+	}
+
+	// Float64x2
+	f64A := [2]float64{10.5, 20.5}
+	f64B := [2]float64{2.0, 4.0}
+	f64Want := [2]float64{12.5, 24.5}
+	var f64Got [2]float64
+
+	f64X := simd.LoadFloat64x2(&f64A)
+	f64Y := simd.LoadFloat64x2(&f64B)
+	f64Sum := f64X.Add(f64Y)
+	f64Sum.Store(&f64Got)
+
+	// Arrays are comparable, so a single comparison covers every element
+	// and a mismatch is reported exactly once.
+	if f64Got != f64Want {
+		fmt.Printf("Float64x2 test failed: expected %v, got %v\n", f64Want, f64Got)
+		fail = true
+	}
+
+	// Int32x4
+	i32A := [4]int32{10, 20, 30, 40}
+	i32B := [4]int32{5, 6, 7, 8}
+	i32Want := [4]int32{15, 26, 37, 48}
+	var i32Got [4]int32
+
+	i32X := simd.LoadInt32x4(&i32A)
+	i32Y := simd.LoadInt32x4(&i32B)
+	i32Sum := i32X.Add(i32Y)
+	i32Sum.Store(&i32Got)
+
+	// Arrays are comparable, so a single comparison covers every element
+	// and a mismatch is reported exactly once.
+	if i32Got != i32Want {
+		fmt.Printf("Int32x4 test failed: expected %v, got %v\n", i32Want, i32Got)
+		fail = true
+	}
+
+	// Int64x2
+	i64A := [2]int64{100, 200}
+	i64B := [2]int64{50, 75}
+	i64Want := [2]int64{150, 275}
+	var i64Got [2]int64
+
+	i64X := simd.LoadInt64x2(&i64A)
+	i64Y := simd.LoadInt64x2(&i64B)
+	i64Sum := i64X.Add(i64Y)
+	i64Sum.Store(&i64Got)
+
+	// Arrays are comparable, so a single comparison covers every element
+	// and a mismatch is reported exactly once.
+	if i64Got != i64Want {
+		fmt.Printf("Int64x2 test failed: expected %v, got %v\n", i64Want, i64Got)
+		fail = true
+	}
+
+	if fail {
+		os.Exit(1)
+	}
+}
diff --git a/src/simd/types_arm64.go b/src/simd/types_arm64.go
new file mode 100644
index 0000000..5f6a398
--- /dev/null
+++ b/src/simd/types_arm64.go
@@ -0,0 +1,86 @@
+// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package simd
+
+// v128 is a tag type that tells the compiler that this is really 128-bit SIMD.
+type v128 struct {
+ _128 struct{} // empty struct: carries no data, present only as the tag
+}
+
+// Float32x4 is a 128-bit SIMD vector of 4 float32.
+type Float32x4 struct {
+ float32x4 v128 // v128 tag field
+ vals [4]float32 // the 4 float32 elements (16 bytes)
+}
+
+// Len returns the number of elements in a Float32x4, which is always 4.
+func (x Float32x4) Len() int { return 4 }
+
+// LoadFloat32x4 loads a Float32x4 from the array pointed to by y.
+//
+//go:noescape
+func LoadFloat32x4(y *[4]float32) Float32x4
+
+// Store stores a Float32x4 to the array pointed to by y.
+//
+//go:noescape
+func (x Float32x4) Store(y *[4]float32)
+
+// Float64x2 is a 128-bit SIMD vector of 2 float64.
+type Float64x2 struct {
+ float64x2 v128 // v128 tag field
+ vals [2]float64 // the 2 float64 elements (16 bytes)
+}
+
+// Len returns the number of elements in a Float64x2, which is always 2.
+func (x Float64x2) Len() int { return 2 }
+
+// LoadFloat64x2 loads a Float64x2 from the array pointed to by y.
+//
+//go:noescape
+func LoadFloat64x2(y *[2]float64) Float64x2
+
+// Store stores a Float64x2 to the array pointed to by y.
+//
+//go:noescape
+func (x Float64x2) Store(y *[2]float64)
+
+// Int32x4 is a 128-bit SIMD vector of 4 int32.
+type Int32x4 struct {
+ int32x4 v128 // v128 tag field
+ vals [4]int32 // the 4 int32 elements (16 bytes)
+}
+
+// Len returns the number of elements in an Int32x4, which is always 4.
+func (x Int32x4) Len() int { return 4 }
+
+// LoadInt32x4 loads an Int32x4 from the array pointed to by y.
+//
+//go:noescape
+func LoadInt32x4(y *[4]int32) Int32x4
+
+// Store stores an Int32x4 to the array pointed to by y.
+//
+//go:noescape
+func (x Int32x4) Store(y *[4]int32)
+
+// Int64x2 is a 128-bit SIMD vector of 2 int64.
+type Int64x2 struct {
+ int64x2 v128 // v128 tag field
+ vals [2]int64 // the 2 int64 elements (16 bytes)
+}
+
+// Len returns the number of elements in an Int64x2, which is always 2.
+func (x Int64x2) Len() int { return 2 }
+
+// LoadInt64x2 loads an Int64x2 from the array pointed to by y.
+//
+//go:noescape
+func LoadInt64x2(y *[2]int64) Int64x2
+
+// Store stores an Int64x2 to the array pointed to by y.
+//
+//go:noescape
+func (x Int64x2) Store(y *[2]int64)
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Thanks for working on this!
I just wanted to warn you that there's definitely some cleanup needed in simdgen before it's really ready for more than one architecture. I just want to make sure you don't dig yourself into a hole in simdgen.
A significant known issue is that some x86-isms have crept into the categories.yaml files. Those should really be shared across architectures. A lot of it's already clean, but at least the comparison operations definitely aren't and there are probably some others.
We'd also like to drive the instruction input to simdgen from the AARCHMRS (ARM Architecture Machine Readable Specification), just like we use XED for Intel.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Thanks for working on this!
I just wanted to warn you that there's definitely some cleanup needed in simdgen before it's really ready for more than one architecture. I just want to make sure you don't dig yourself into a hole in simdgen.
A significant known issue is that some x86-isms have crept into the categories.yaml files. Those should really be shared across architectures. A lot of it's already clean, but at least the comparison operations definitely aren't and there are probably some others.
We'd also like to drive the instruction input to simdgen from the AARCHMRS (ARM Architecture Machine Readable Specification), just like we use XED for Intel.
Thanks for the heads up! For now, I'm focusing on exploring different ARM64 register arrangements and helpers for the ssa->Prog translation in arm64/ssa.go (just added in subsequent CLs), so I've deliberately skipped using the categories for the moment to concentrate on that. These changes mostly involve adding an Arrangement operation field and some other helpers (e.g. to allow different encodings for some instructions) and representing them in simdgen.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Hi Alexander — thanks for all the work on this chain. I've been working on ARM64 SIMD support independently (on a local branch off dev.simd) and wanted to reach out about coordinating.
Re Austin's comment about x86-isms in the categories — I hit the same issues when mapping the API surface to ARM64 NEON. The main areas that will need attention are comparisons (constImm encodes x86-specific immediate predicates, and the signed/unsigned split works differently on ARM64), and mask representation (NEON masks are full vectors rather than compact bitmasks, which makes operations like ToBits/FromBits multi-instruction sequences — worth considering alongside SVE predicates for the mask abstraction design).
I've been testing your XML parser and it handles the NEON instruction set well. I'd like to help expand the ARM64 category coverage — I could contribute category files for BitwiseLogic, MinMax, Compares, and other operation groups in your arm64/ops/ format.
I also noticed that gen_simdMachineOps.go doesn't have a register constraint template for unary operations (v11 — 1 input, 1 output) on arm64, which blocks categories like IntOnlyArith (Abs, Neg, CLZ, CNT) and FPonlyArith (Sqrt, Abs, Neg). I'd be happy to work on adding that support as well.
Would it be useful to coordinate?
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Hi Alexander — thanks for all the work on this chain. I've been working on ARM64 SIMD support independently (on a local branch off dev.simd) and wanted to reach out about coordinating.
Re Austin's comment about x86-isms in the categories — I hit the same issues when mapping the API surface to ARM64 NEON. The main areas that will need attention are comparisons (constImm encodes x86-specific immediate predicates, and the signed/unsigned split works differently on ARM64), and mask representation (NEON masks are full vectors rather than compact bitmasks, which makes operations like ToBits/FromBits multi-instruction sequences — worth considering alongside SVE predicates for the mask abstraction design).
I've been testing your XML parser and it handles the NEON instruction set well. I'd like to help expand the ARM64 category coverage — I could contribute category files for BitwiseLogic, MinMax, Compares, and other operation groups in your arm64/ops/ format.
I also noticed that gen_simdMachineOps.go doesn't have a register constraint template for unary operations (v11 — 1 input, 1 output) on arm64, which blocks categories like IntOnlyArith (Abs, Neg, CLZ, CNT) and FPonlyArith (Sqrt, Abs, Neg). I'd be happy to work on adding that support as well.
Would it be useful to coordinate?
Thanks — yes, let's coordinate! A few things to be aware of:
Most of this chain is not yet reviewed, so the base may change. This CL itself seems mostly ready for review, but nothing below it is settled yet.
On my end, the short-term priority is a change to support bounded immediates (replacing the current up-to-255 encoding) — arm64 instructions won't assemble with out-of-range immediates, and this blocks a lot of categories. After that I'm working on GetElem/SetElem — I plan to reuse some slice-parts code from @drc...@google.com 's WIP wasm SIMD support (CL 745080) to test arm64 GetElem/SetElem for integer and FP NEON vector types. That will be a follow-up CL.
Adding @drc...@google.com , @shaoj...@google.com — they'll need to review this work.
Re the v11 register constraint template — in the current version of this CL it should be enough to add entries into `arch.go`; `arm64/ssa.go` already has a `simdV11` function that gets used underneath. Category file contributions for BitwiseLogic, MinMax, Compares etc. in arm64/ops/ format also sound great — just expect some churn underneath until the earlier CLs are reviewed.
On the comparison/mask design — for NEON we should try to stay close to wasm SIMD128 where possible, since wasm comparisons also use full-vector masks (all-ones per lane for true, all-zeros for false) in regular v128 registers, same as NEON. That said, the broader mask abstraction (especially looking ahead to SVE predicates) will still need discussion.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Alexander Musman
Hi Alexander — thanks for all the work on this chain. I've been working on ARM64 SIMD support independently (on a local branch off dev.simd) and wanted to reach out about coordinating.
Re Austin's comment about x86-isms in the categories — I hit the same issues when mapping the API surface to ARM64 NEON. The main areas that will need attention are comparisons (constImm encodes x86-specific immediate predicates, and the signed/unsigned split works differently on ARM64), and mask representation (NEON masks are full vectors rather than compact bitmasks, which makes operations like ToBits/FromBits multi-instruction sequences — worth considering alongside SVE predicates for the mask abstraction design).
I've been testing your XML parser and it handles the NEON instruction set well. I'd like to help expand the ARM64 category coverage — I could contribute category files for BitwiseLogic, MinMax, Compares, and other operation groups in your arm64/ops/ format.
I also noticed that gen_simdMachineOps.go doesn't have a register constraint template for unary operations (v11 — 1 input, 1 output) on arm64, which blocks categories like IntOnlyArith (Abs, Neg, CLZ, CNT) and FPonlyArith (Sqrt, Abs, Neg). I'd be happy to work on adding that support as well.
Would it be useful to coordinate?
Thanks — yes, let's coordinate! A few things to be aware of:
Most of this chain is not yet reviewed, so the base may change. This CL itself seems mostly ready for review, but nothing below it is settled yet.
On my end, the short-term priority is a change to support bounded immediates (replacing the current up-to-255 encoding) — arm64 instructions won't assemble with out-of-range immediates, and this blocks a lot of categories. After that I'm working on GetElem/SetElem — I plan to reuse some slice-parts code from @drc...@google.com 's WIP wasm SIMD support (CL 745080) to test arm64 GetElem/SetElem for integer and FP NEON vector types. That will be a follow-up CL.
Adding @drc...@google.com , @shaoj...@google.com — they'll need to review this work.
Re the v11 register constraint template — in the current version of this CL it should be enough to add entries into `arch.go`; `arm64/ssa.go` already has a `simdV11` function that gets used underneath. Category file contributions for BitwiseLogic, MinMax, Compares etc. in arm64/ops/ format also sound great — just expect some churn underneath until the earlier CLs are reviewed.
On the comparison/mask design — for NEON we should try to stay close to wasm SIMD128 where possible, since wasm comparisons also use full-vector masks (all-ones per lane for true, all-zeros for false) in regular v128 registers, same as NEON. That said, the broader mask abstraction (especially looking ahead to SVE predicates) will still need discussion.
Thanks for the quick response! I will finish preparing a fix for the v11 issue and submit a CL in the next day or two.
I also have some work to address the concerns that Austin raised. That work is independent so I'll try to get that posted soon too, so we can begin getting things reviewed in parallel.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
I don't think we want to create an "arm64" directory. This would create asymmetry between amd64 and arm64.
Also, some of the existing YAML files are meant to be portable and used across architectures. The "categories.yaml" files are among them. (They may list operations that do not unify on some architectures, and that is fine.) So we don't want to create arm64/.../categories.yaml files.
go.yaml's are currently arch-specific. We could consider renaming them go_amd64.yaml, and add go_arm64.yaml. Or we could consider adding "goarch: amd64" entries to the current go.yaml, and then we can add arm64 entries to the same file.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Alexander Musman
Hi Alexander — thanks for all the work on this chain. I've been working on ARM64 SIMD support independently (on a local branch off dev.simd) and wanted to reach out about coordinating.
Re Austin's comment about x86-isms in the categories — I hit the same issues when mapping the API surface to ARM64 NEON. The main areas that will need attention are comparisons (constImm encodes x86-specific immediate predicates, and the signed/unsigned split works differently on ARM64), and mask representation (NEON masks are full vectors rather than compact bitmasks, which makes operations like ToBits/FromBits multi-instruction sequences — worth considering alongside SVE predicates for the mask abstraction design).
I've been testing your XML parser and it handles the NEON instruction set well. I'd like to help expand the ARM64 category coverage — I could contribute category files for BitwiseLogic, MinMax, Compares, and other operation groups in your arm64/ops/ format.
I also noticed that gen_simdMachineOps.go doesn't have a register constraint template for unary operations (v11 — 1 input, 1 output) on arm64, which blocks categories like IntOnlyArith (Abs, Neg, CLZ, CNT) and FPonlyArith (Sqrt, Abs, Neg). I'd be happy to work on adding that support as well.
Would it be useful to coordinate?
Jonathan Swinney
Thanks — yes, let's coordinate! A few things to be aware of:
Most of this chain is not yet reviewed, so the base may change. This CL itself seems mostly ready for review, but nothing below it is settled yet.
On my end, the short-term priority is a change to support bounded immediates (replacing the current up-to-255 encoding) — arm64 instructions won't assemble with out-of-range immediates, and this blocks a lot of categories. After that I'm working on GetElem/SetElem — I plan to reuse some slice-parts code from @drc...@google.com 's WIP wasm SIMD support (CL 745080) to test arm64 GetElem/SetElem for integer and FP NEON vector types. That will be a follow-up CL.
Adding @drc...@google.com , @shaoj...@google.com — they'll need to review this work.
Re the v11 register constraint template — in the current version of this CL it should be enough to add entries into `arch.go`; `arm64/ssa.go` already has a `simdV11` function that gets used underneath. Category file contributions for BitwiseLogic, MinMax, Compares etc. in arm64/ops/ format also sound great — just expect some churn underneath until the earlier CLs are reviewed.
On the comparison/mask design — for NEON we should try to stay close to wasm SIMD128 where possible, since wasm comparisons also use full-vector masks (all-ones per lane for true, all-zeros for false) in regular v128 registers, same as NEON. That said, the broader mask abstraction (especially looking ahead to SVE predicates) will still need discussion.
Thanks for the quick response! I will finish preparing a fix for the v11 issue and submit a CL in the next day or two.
I also have some work to address the concerns that Austin raised. That work is independent so I'll try to get that posted soon too, so we can begin getting things reviewed in parallel.
Acknowledged
I don't think we want to create an "arm64" directory. This would create asymmetry between amd64 and arm64.
Also, some of the existing YAML files are meant to be portable and used across architectures. The "categories.yaml" files are among them. (They may list operations that do not unify on some architectures, and that is fine.) So we don't want to create arm64/.../categories.yaml files.
go.yaml's are currently arch-specific. We could consider renaming them go_amd64.yaml, and add go_arm64.yaml. Or we could consider adding "goarch: amd64" entries to the current go.yaml, and then we can add arm64 entries to the same file.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Code-Review | +2 |
| Commit-Queue | +1 |
I'll kick the TryBots again, just to see if it works.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| TryBot-Bypass | +1 |
Bypassing the trybots because that failure is unrelated to this CL.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
I've rebased and resolved conflicts on this CL and the next one, CL 773240 (simd: reorganize simdgen YAML configs by architecture) — both should be ready to merge now (would appreciate a quick look at the conflict resolution). The rest of the chain is in progress; I'll push updates as I work through them.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
Thank you for the rebase, I am going to have a shot at getting this all in. I'm a little worried about recent churn in simd for amd64, I guess we'll see.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Auto-Submit | +1 |
| Code-Review | +2 |
LGTM
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Sorry about the conflict, it's in gen_simdIntrinsics.go, roughly
`1,$s/Foo/TypeDotMethod/g`
| Code-Review | +1 |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |