[go/dev.simd] [dev.simd] simd, cmd/compile: unexport broadcast1ToN

2 views
Skip to first unread message

Junyang Shao (Gerrit)

unread,
May 8, 2026, 4:42:42 PM (3 days ago)
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Junyang Shao has uploaded the change for review

Commit message

[dev.simd] simd, cmd/compile: unexport broadcast1ToN

For unexported methods, the Masked forms are also generated into the API.
Since they are unexported, this CL does not try to remove them, even though
they currently have no references. But we might want them in the future!

For #78979.
Change-Id: Ied1b159a68b54785254b35ea61145dfdc27832b6

Change diff

diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index f94b299..7659601 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -25,23 +25,6 @@
ssa.OpAMD64VPABSQ128,
ssa.OpAMD64VPABSQ256,
ssa.OpAMD64VPABSQ512,
- ssa.OpAMD64VPBROADCASTQ128,
- ssa.OpAMD64VBROADCASTSS128,
- ssa.OpAMD64VBROADCASTSD256,
- ssa.OpAMD64VPBROADCASTD128,
- ssa.OpAMD64VPBROADCASTQ256,
- ssa.OpAMD64VBROADCASTSS256,
- ssa.OpAMD64VBROADCASTSD512,
- ssa.OpAMD64VPBROADCASTW128,
- ssa.OpAMD64VPBROADCASTD256,
- ssa.OpAMD64VPBROADCASTQ512,
- ssa.OpAMD64VBROADCASTSS512,
- ssa.OpAMD64VPBROADCASTB128,
- ssa.OpAMD64VPBROADCASTW256,
- ssa.OpAMD64VPBROADCASTD512,
- ssa.OpAMD64VPBROADCASTB256,
- ssa.OpAMD64VPBROADCASTW512,
- ssa.OpAMD64VPBROADCASTB512,
ssa.OpAMD64VCVTPD2PSX128,
ssa.OpAMD64VCVTPD2PSY128,
ssa.OpAMD64VCVTPD2PS256,
@@ -216,7 +199,24 @@
ssa.OpAMD64VPMOVQW128_512,
ssa.OpAMD64VPMOVQD128_128,
ssa.OpAMD64VPMOVQD128_256,
- ssa.OpAMD64VPMOVQD256:
+ ssa.OpAMD64VPMOVQD256,
+ ssa.OpAMD64VPBROADCASTQ128,
+ ssa.OpAMD64VBROADCASTSS128,
+ ssa.OpAMD64VBROADCASTSD256,
+ ssa.OpAMD64VPBROADCASTD128,
+ ssa.OpAMD64VPBROADCASTQ256,
+ ssa.OpAMD64VBROADCASTSS256,
+ ssa.OpAMD64VBROADCASTSD512,
+ ssa.OpAMD64VPBROADCASTW128,
+ ssa.OpAMD64VPBROADCASTD256,
+ ssa.OpAMD64VPBROADCASTQ512,
+ ssa.OpAMD64VBROADCASTSS512,
+ ssa.OpAMD64VPBROADCASTB128,
+ ssa.OpAMD64VPBROADCASTW256,
+ ssa.OpAMD64VPBROADCASTD512,
+ ssa.OpAMD64VPBROADCASTB256,
+ ssa.OpAMD64VPBROADCASTW512,
+ ssa.OpAMD64VPBROADCASTB512:
p = simdV11(s, v)

case ssa.OpAMD64VAESDECLAST128,
@@ -832,23 +832,6 @@
ssa.OpAMD64VPABSQMasked128,
ssa.OpAMD64VPABSQMasked256,
ssa.OpAMD64VPABSQMasked512,
- ssa.OpAMD64VPBROADCASTQMasked128,
- ssa.OpAMD64VBROADCASTSSMasked128,
- ssa.OpAMD64VBROADCASTSDMasked256,
- ssa.OpAMD64VPBROADCASTDMasked128,
- ssa.OpAMD64VPBROADCASTQMasked256,
- ssa.OpAMD64VBROADCASTSSMasked256,
- ssa.OpAMD64VBROADCASTSDMasked512,
- ssa.OpAMD64VPBROADCASTWMasked128,
- ssa.OpAMD64VPBROADCASTDMasked256,
- ssa.OpAMD64VPBROADCASTQMasked512,
- ssa.OpAMD64VBROADCASTSSMasked512,
- ssa.OpAMD64VPBROADCASTBMasked128,
- ssa.OpAMD64VPBROADCASTWMasked256,
- ssa.OpAMD64VPBROADCASTDMasked512,
- ssa.OpAMD64VPBROADCASTBMasked256,
- ssa.OpAMD64VPBROADCASTWMasked512,
- ssa.OpAMD64VPBROADCASTBMasked512,
ssa.OpAMD64VCOMPRESSPSMasked128,
ssa.OpAMD64VCOMPRESSPSMasked256,
ssa.OpAMD64VCOMPRESSPSMasked512,
@@ -1060,6 +1043,23 @@
ssa.OpAMD64VPMOVQDMasked128_128,
ssa.OpAMD64VPMOVQDMasked128_256,
ssa.OpAMD64VPMOVQDMasked256,
+ ssa.OpAMD64VPBROADCASTQMasked128,
+ ssa.OpAMD64VBROADCASTSSMasked128,
+ ssa.OpAMD64VBROADCASTSDMasked256,
+ ssa.OpAMD64VPBROADCASTDMasked128,
+ ssa.OpAMD64VPBROADCASTQMasked256,
+ ssa.OpAMD64VBROADCASTSSMasked256,
+ ssa.OpAMD64VBROADCASTSDMasked512,
+ ssa.OpAMD64VPBROADCASTWMasked128,
+ ssa.OpAMD64VPBROADCASTDMasked256,
+ ssa.OpAMD64VPBROADCASTQMasked512,
+ ssa.OpAMD64VBROADCASTSSMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked128,
+ ssa.OpAMD64VPBROADCASTWMasked256,
+ ssa.OpAMD64VPBROADCASTDMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked256,
+ ssa.OpAMD64VPBROADCASTWMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked512,
ssa.OpAMD64VMOVDQU8Masked128,
ssa.OpAMD64VMOVDQU8Masked256,
ssa.OpAMD64VMOVDQU8Masked512,
@@ -2448,23 +2448,6 @@
ssa.OpAMD64VPABSQMasked128Merging,
ssa.OpAMD64VPABSQMasked256Merging,
ssa.OpAMD64VPABSQMasked512Merging,
- ssa.OpAMD64VPBROADCASTQMasked128Merging,
- ssa.OpAMD64VBROADCASTSSMasked128Merging,
- ssa.OpAMD64VBROADCASTSDMasked256Merging,
- ssa.OpAMD64VPBROADCASTDMasked128Merging,
- ssa.OpAMD64VPBROADCASTQMasked256Merging,
- ssa.OpAMD64VBROADCASTSSMasked256Merging,
- ssa.OpAMD64VBROADCASTSDMasked512Merging,
- ssa.OpAMD64VPBROADCASTWMasked128Merging,
- ssa.OpAMD64VPBROADCASTDMasked256Merging,
- ssa.OpAMD64VPBROADCASTQMasked512Merging,
- ssa.OpAMD64VBROADCASTSSMasked512Merging,
- ssa.OpAMD64VPBROADCASTBMasked128Merging,
- ssa.OpAMD64VPBROADCASTWMasked256Merging,
- ssa.OpAMD64VPBROADCASTDMasked512Merging,
- ssa.OpAMD64VPBROADCASTBMasked256Merging,
- ssa.OpAMD64VPBROADCASTWMasked512Merging,
- ssa.OpAMD64VPBROADCASTBMasked512Merging,
ssa.OpAMD64VRNDSCALEPSMasked128Merging,
ssa.OpAMD64VRNDSCALEPSMasked256Merging,
ssa.OpAMD64VRNDSCALEPSMasked512Merging,
@@ -2664,6 +2647,23 @@
ssa.OpAMD64VPMOVQDMasked128_128Merging,
ssa.OpAMD64VPMOVQDMasked128_256Merging,
ssa.OpAMD64VPMOVQDMasked256Merging,
+ ssa.OpAMD64VPBROADCASTQMasked128Merging,
+ ssa.OpAMD64VBROADCASTSSMasked128Merging,
+ ssa.OpAMD64VBROADCASTSDMasked256Merging,
+ ssa.OpAMD64VPBROADCASTDMasked128Merging,
+ ssa.OpAMD64VPBROADCASTQMasked256Merging,
+ ssa.OpAMD64VBROADCASTSSMasked256Merging,
+ ssa.OpAMD64VBROADCASTSDMasked512Merging,
+ ssa.OpAMD64VPBROADCASTWMasked128Merging,
+ ssa.OpAMD64VPBROADCASTDMasked256Merging,
+ ssa.OpAMD64VPBROADCASTQMasked512Merging,
+ ssa.OpAMD64VBROADCASTSSMasked512Merging,
+ ssa.OpAMD64VPBROADCASTBMasked128Merging,
+ ssa.OpAMD64VPBROADCASTWMasked256Merging,
+ ssa.OpAMD64VPBROADCASTDMasked512Merging,
+ ssa.OpAMD64VPBROADCASTBMasked256Merging,
+ ssa.OpAMD64VPBROADCASTWMasked512Merging,
+ ssa.OpAMD64VPBROADCASTBMasked512Merging,
ssa.OpAMD64VPSHUFDMasked256Merging,
ssa.OpAMD64VPSHUFDMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked256Merging,
@@ -2805,23 +2805,6 @@
ssa.OpAMD64VPAVGWMasked128,
ssa.OpAMD64VPAVGWMasked256,
ssa.OpAMD64VPAVGWMasked512,
- ssa.OpAMD64VPBROADCASTQMasked128,
- ssa.OpAMD64VBROADCASTSSMasked128,
- ssa.OpAMD64VBROADCASTSDMasked256,
- ssa.OpAMD64VPBROADCASTDMasked128,
- ssa.OpAMD64VPBROADCASTQMasked256,
- ssa.OpAMD64VBROADCASTSSMasked256,
- ssa.OpAMD64VBROADCASTSDMasked512,
- ssa.OpAMD64VPBROADCASTWMasked128,
- ssa.OpAMD64VPBROADCASTDMasked256,
- ssa.OpAMD64VPBROADCASTQMasked512,
- ssa.OpAMD64VBROADCASTSSMasked512,
- ssa.OpAMD64VPBROADCASTBMasked128,
- ssa.OpAMD64VPBROADCASTWMasked256,
- ssa.OpAMD64VPBROADCASTDMasked512,
- ssa.OpAMD64VPBROADCASTBMasked256,
- ssa.OpAMD64VPBROADCASTWMasked512,
- ssa.OpAMD64VPBROADCASTBMasked512,
ssa.OpAMD64VRNDSCALEPSMasked128,
ssa.OpAMD64VRNDSCALEPSMasked128load,
ssa.OpAMD64VRNDSCALEPSMasked256,
@@ -3662,6 +3645,23 @@
ssa.OpAMD64VPXORQMasked256load,
ssa.OpAMD64VPXORQMasked512,
ssa.OpAMD64VPXORQMasked512load,
+ ssa.OpAMD64VPBROADCASTQMasked128,
+ ssa.OpAMD64VBROADCASTSSMasked128,
+ ssa.OpAMD64VBROADCASTSDMasked256,
+ ssa.OpAMD64VPBROADCASTDMasked128,
+ ssa.OpAMD64VPBROADCASTQMasked256,
+ ssa.OpAMD64VBROADCASTSSMasked256,
+ ssa.OpAMD64VBROADCASTSDMasked512,
+ ssa.OpAMD64VPBROADCASTWMasked128,
+ ssa.OpAMD64VPBROADCASTDMasked256,
+ ssa.OpAMD64VPBROADCASTQMasked512,
+ ssa.OpAMD64VBROADCASTSSMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked128,
+ ssa.OpAMD64VPBROADCASTWMasked256,
+ ssa.OpAMD64VPBROADCASTDMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked256,
+ ssa.OpAMD64VPBROADCASTWMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked512,
ssa.OpAMD64VMOVDQU8Masked128,
ssa.OpAMD64VMOVDQU8Masked256,
ssa.OpAMD64VMOVDQU8Masked512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 716668b..2e295d9 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -140,36 +140,6 @@
(AverageUint16x8 ...) => (VPAVGW128 ...)
(AverageUint16x16 ...) => (VPAVGW256 ...)
(AverageUint16x32 ...) => (VPAVGW512 ...)
-(Broadcast1To2Float64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast1To2Int64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast1To2Uint64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast1To4Float32x4 ...) => (VBROADCASTSS128 ...)
-(Broadcast1To4Float64x2 ...) => (VBROADCASTSD256 ...)
-(Broadcast1To4Int32x4 ...) => (VPBROADCASTD128 ...)
-(Broadcast1To4Int64x2 ...) => (VPBROADCASTQ256 ...)
-(Broadcast1To4Uint32x4 ...) => (VPBROADCASTD128 ...)
-(Broadcast1To4Uint64x2 ...) => (VPBROADCASTQ256 ...)
-(Broadcast1To8Float32x4 ...) => (VBROADCASTSS256 ...)
-(Broadcast1To8Float64x2 ...) => (VBROADCASTSD512 ...)
-(Broadcast1To8Int16x8 ...) => (VPBROADCASTW128 ...)
-(Broadcast1To8Int32x4 ...) => (VPBROADCASTD256 ...)
-(Broadcast1To8Int64x2 ...) => (VPBROADCASTQ512 ...)
-(Broadcast1To8Uint16x8 ...) => (VPBROADCASTW128 ...)
-(Broadcast1To8Uint32x4 ...) => (VPBROADCASTD256 ...)
-(Broadcast1To8Uint64x2 ...) => (VPBROADCASTQ512 ...)
-(Broadcast1To16Float32x4 ...) => (VBROADCASTSS512 ...)
-(Broadcast1To16Int8x16 ...) => (VPBROADCASTB128 ...)
-(Broadcast1To16Int16x8 ...) => (VPBROADCASTW256 ...)
-(Broadcast1To16Int32x4 ...) => (VPBROADCASTD512 ...)
-(Broadcast1To16Uint8x16 ...) => (VPBROADCASTB128 ...)
-(Broadcast1To16Uint16x8 ...) => (VPBROADCASTW256 ...)
-(Broadcast1To16Uint32x4 ...) => (VPBROADCASTD512 ...)
-(Broadcast1To32Int8x16 ...) => (VPBROADCASTB256 ...)
-(Broadcast1To32Int16x8 ...) => (VPBROADCASTW512 ...)
-(Broadcast1To32Uint8x16 ...) => (VPBROADCASTB256 ...)
-(Broadcast1To32Uint16x8 ...) => (VPBROADCASTW512 ...)
-(Broadcast1To64Int8x16 ...) => (VPBROADCASTB512 ...)
-(Broadcast1To64Uint8x16 ...) => (VPBROADCASTB512 ...)
(CeilFloat32x4 x) => (VROUNDPS128 [2] x)
(CeilFloat32x8 x) => (VROUNDPS256 [2] x)
(CeilFloat64x2 x) => (VROUNDPD128 [2] x)
@@ -1339,6 +1309,66 @@
(blendMaskedInt16x32 x y mask) => (VPBLENDMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
(blendMaskedInt32x16 x y mask) => (VPBLENDMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
(blendMaskedInt64x8 x y mask) => (VPBLENDMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(broadcast1To2Float64x2 ...) => (VPBROADCASTQ128 ...)
+(broadcast1To2Int64x2 ...) => (VPBROADCASTQ128 ...)
+(broadcast1To2Uint64x2 ...) => (VPBROADCASTQ128 ...)
+(broadcast1To2MaskedFloat64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To2MaskedInt64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To2MaskedUint64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To4Float32x4 ...) => (VBROADCASTSS128 ...)
+(broadcast1To4Float64x2 ...) => (VBROADCASTSD256 ...)
+(broadcast1To4Int32x4 ...) => (VPBROADCASTD128 ...)
+(broadcast1To4Int64x2 ...) => (VPBROADCASTQ256 ...)
+(broadcast1To4Uint32x4 ...) => (VPBROADCASTD128 ...)
+(broadcast1To4Uint64x2 ...) => (VPBROADCASTQ256 ...)
+(broadcast1To4MaskedFloat32x4 x mask) => (VBROADCASTSSMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To4MaskedFloat64x2 x mask) => (VBROADCASTSDMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To4MaskedInt32x4 x mask) => (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To4MaskedInt64x2 x mask) => (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To4MaskedUint32x4 x mask) => (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To4MaskedUint64x2 x mask) => (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To8Float32x4 ...) => (VBROADCASTSS256 ...)
+(broadcast1To8Float64x2 ...) => (VBROADCASTSD512 ...)
+(broadcast1To8Int16x8 ...) => (VPBROADCASTW128 ...)
+(broadcast1To8Int32x4 ...) => (VPBROADCASTD256 ...)
+(broadcast1To8Int64x2 ...) => (VPBROADCASTQ512 ...)
+(broadcast1To8Uint16x8 ...) => (VPBROADCASTW128 ...)
+(broadcast1To8Uint32x4 ...) => (VPBROADCASTD256 ...)
+(broadcast1To8Uint64x2 ...) => (VPBROADCASTQ512 ...)
+(broadcast1To8MaskedFloat32x4 x mask) => (VBROADCASTSSMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To8MaskedFloat64x2 x mask) => (VBROADCASTSDMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To8MaskedInt16x8 x mask) => (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To8MaskedInt32x4 x mask) => (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To8MaskedInt64x2 x mask) => (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To8MaskedUint16x8 x mask) => (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To8MaskedUint32x4 x mask) => (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To8MaskedUint64x2 x mask) => (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To16Float32x4 ...) => (VBROADCASTSS512 ...)
+(broadcast1To16Int8x16 ...) => (VPBROADCASTB128 ...)
+(broadcast1To16Int16x8 ...) => (VPBROADCASTW256 ...)
+(broadcast1To16Int32x4 ...) => (VPBROADCASTD512 ...)
+(broadcast1To16Uint8x16 ...) => (VPBROADCASTB128 ...)
+(broadcast1To16Uint16x8 ...) => (VPBROADCASTW256 ...)
+(broadcast1To16Uint32x4 ...) => (VPBROADCASTD512 ...)
+(broadcast1To16MaskedFloat32x4 x mask) => (VBROADCASTSSMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To16MaskedInt8x16 x mask) => (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(broadcast1To16MaskedInt16x8 x mask) => (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To16MaskedInt32x4 x mask) => (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To16MaskedUint8x16 x mask) => (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(broadcast1To16MaskedUint16x8 x mask) => (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To16MaskedUint32x4 x mask) => (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To32Int8x16 ...) => (VPBROADCASTB256 ...)
+(broadcast1To32Int16x8 ...) => (VPBROADCASTW512 ...)
+(broadcast1To32Uint8x16 ...) => (VPBROADCASTB256 ...)
+(broadcast1To32Uint16x8 ...) => (VPBROADCASTW512 ...)
+(broadcast1To32MaskedInt8x16 x mask) => (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(broadcast1To32MaskedInt16x8 x mask) => (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To32MaskedUint8x16 x mask) => (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(broadcast1To32MaskedUint16x8 x mask) => (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To64Int8x16 ...) => (VPBROADCASTB512 ...)
+(broadcast1To64Uint8x16 ...) => (VPBROADCASTB512 ...)
+(broadcast1To64MaskedInt8x16 x mask) => (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(broadcast1To64MaskedUint8x16 x mask) => (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
(carrylessMultiplyUint64x2 ...) => (VPCLMULQDQ128 ...)
(carrylessMultiplyUint64x4 ...) => (VPCLMULQDQ256 ...)
(carrylessMultiplyUint64x8 ...) => (VPCLMULQDQ512 ...)
@@ -1496,23 +1526,6 @@
(VMOVDQU16Masked128 (VPAVGW128 x y) mask) => (VPAVGWMasked128 x y mask)
(VMOVDQU16Masked256 (VPAVGW256 x y) mask) => (VPAVGWMasked256 x y mask)
(VMOVDQU16Masked512 (VPAVGW512 x y) mask) => (VPAVGWMasked512 x y mask)
-(VMOVDQU64Masked128 (VPBROADCASTQ128 x) mask) => (VPBROADCASTQMasked128 x mask)
-(VMOVDQU32Masked128 (VBROADCASTSS128 x) mask) => (VBROADCASTSSMasked128 x mask)
-(VMOVDQU64Masked256 (VBROADCASTSD256 x) mask) => (VBROADCASTSDMasked256 x mask)
-(VMOVDQU32Masked128 (VPBROADCASTD128 x) mask) => (VPBROADCASTDMasked128 x mask)
-(VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask) => (VPBROADCASTQMasked256 x mask)
-(VMOVDQU32Masked256 (VBROADCASTSS256 x) mask) => (VBROADCASTSSMasked256 x mask)
-(VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) => (VBROADCASTSDMasked512 x mask)
-(VMOVDQU16Masked128 (VPBROADCASTW128 x) mask) => (VPBROADCASTWMasked128 x mask)
-(VMOVDQU32Masked256 (VPBROADCASTD256 x) mask) => (VPBROADCASTDMasked256 x mask)
-(VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) => (VPBROADCASTQMasked512 x mask)
-(VMOVDQU32Masked512 (VBROADCASTSS512 x) mask) => (VBROADCASTSSMasked512 x mask)
-(VMOVDQU8Masked128 (VPBROADCASTB128 x) mask) => (VPBROADCASTBMasked128 x mask)
-(VMOVDQU16Masked256 (VPBROADCASTW256 x) mask) => (VPBROADCASTWMasked256 x mask)
-(VMOVDQU32Masked512 (VPBROADCASTD512 x) mask) => (VPBROADCASTDMasked512 x mask)
-(VMOVDQU8Masked256 (VPBROADCASTB256 x) mask) => (VPBROADCASTBMasked256 x mask)
-(VMOVDQU16Masked512 (VPBROADCASTW512 x) mask) => (VPBROADCASTWMasked512 x mask)
-(VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) => (VPBROADCASTBMasked512 x mask)
(VMOVDQU32Masked128 (VRNDSCALEPS128 [a] x) mask) => (VRNDSCALEPSMasked128 [a] x mask)
(VMOVDQU32Masked256 (VRNDSCALEPS256 [a] x) mask) => (VRNDSCALEPSMasked256 [a] x mask)
(VMOVDQU32Masked512 (VRNDSCALEPS512 [a] x) mask) => (VRNDSCALEPSMasked512 [a] x mask)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index e076b06..44b9283 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -143,36 +143,6 @@
{name: "AverageUint16x8", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AverageUint16x16", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AverageUint16x32", argLength: 2, commutative: true}, // ARCH:amd64
- {name: "Broadcast1To2Float64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To2Int64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To2Uint64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Float32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Float64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Int32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Int64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Uint32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Uint64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Float32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Float64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Int16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Int32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Int64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Uint16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Uint32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Uint64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Float32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Int8x16", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Int16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Int32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Uint8x16", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Uint16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Uint32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To32Int8x16", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To32Int16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To32Uint8x16", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To32Uint16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To64Int8x16", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To64Uint8x16", argLength: 1}, // ARCH:amd64
{name: "CeilFloat32x4", argLength: 1}, // ARCH:amd64
{name: "CeilFloat32x8", argLength: 1}, // ARCH:amd64
{name: "CeilFloat64x2", argLength: 1}, // ARCH:amd64
@@ -1140,6 +1110,66 @@
{name: "blendMaskedInt16x32", argLength: 3}, // ARCH:amd64
{name: "blendMaskedInt32x16", argLength: 3}, // ARCH:amd64
{name: "blendMaskedInt64x8", argLength: 3}, // ARCH:amd64
+ {name: "broadcast1To2Float64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To2Int64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To2MaskedFloat64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To2MaskedInt64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To2MaskedUint64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To2Uint64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4Float32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4Float64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4Int32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4Int64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4MaskedFloat32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4MaskedFloat64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4MaskedInt32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4MaskedInt64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4MaskedUint32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4MaskedUint64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4Uint32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4Uint64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Float32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Float64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Int16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Int32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Int64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8MaskedFloat32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedFloat64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedInt32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedInt64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedUint16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedUint32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedUint64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8Uint16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Uint32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Uint64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Float32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Int8x16", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Int16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Int32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16MaskedFloat32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedInt8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedInt32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedUint8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedUint16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedUint32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16Uint8x16", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Uint16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Uint32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To32Int8x16", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To32Int16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To32MaskedInt8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To32MaskedInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To32MaskedUint8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To32MaskedUint16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To32Uint8x16", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To32Uint16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To64Int8x16", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To64MaskedInt8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To64MaskedUint8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To64Uint8x16", argLength: 1}, // ARCH:amd64
{name: "AESRoundKeyGenAssistUint32x4", argLength: 1, aux: "UInt8"}, // ARCH:amd64
{name: "CeilScaledFloat32x4", argLength: 1, aux: "UInt8"}, // ARCH:amd64
{name: "CeilScaledFloat32x8", argLength: 1, aux: "UInt8"}, // ARCH:amd64
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 4e52781..50e7df3 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -6345,36 +6345,6 @@
OpAverageUint16x8
OpAverageUint16x16
OpAverageUint16x32
- OpBroadcast1To2Float64x2
- OpBroadcast1To2Int64x2
- OpBroadcast1To2Uint64x2
- OpBroadcast1To4Float32x4
- OpBroadcast1To4Float64x2
- OpBroadcast1To4Int32x4
- OpBroadcast1To4Int64x2
- OpBroadcast1To4Uint32x4
- OpBroadcast1To4Uint64x2
- OpBroadcast1To8Float32x4
- OpBroadcast1To8Float64x2
- OpBroadcast1To8Int16x8
- OpBroadcast1To8Int32x4
- OpBroadcast1To8Int64x2
- OpBroadcast1To8Uint16x8
- OpBroadcast1To8Uint32x4
- OpBroadcast1To8Uint64x2
- OpBroadcast1To16Float32x4
- OpBroadcast1To16Int8x16
- OpBroadcast1To16Int16x8
- OpBroadcast1To16Int32x4
- OpBroadcast1To16Uint8x16
- OpBroadcast1To16Uint16x8
- OpBroadcast1To16Uint32x4
- OpBroadcast1To32Int8x16
- OpBroadcast1To32Int16x8
- OpBroadcast1To32Uint8x16
- OpBroadcast1To32Uint16x8
- OpBroadcast1To64Int8x16
- OpBroadcast1To64Uint8x16
OpCeilFloat32x4
OpCeilFloat32x8
OpCeilFloat64x2
@@ -7342,6 +7312,66 @@
OpblendMaskedInt16x32
OpblendMaskedInt32x16
OpblendMaskedInt64x8
+ Opbroadcast1To2Float64x2
+ Opbroadcast1To2Int64x2
+ Opbroadcast1To2MaskedFloat64x2
+ Opbroadcast1To2MaskedInt64x2
+ Opbroadcast1To2MaskedUint64x2
+ Opbroadcast1To2Uint64x2
+ Opbroadcast1To4Float32x4
+ Opbroadcast1To4Float64x2
+ Opbroadcast1To4Int32x4
+ Opbroadcast1To4Int64x2
+ Opbroadcast1To4MaskedFloat32x4
+ Opbroadcast1To4MaskedFloat64x2
+ Opbroadcast1To4MaskedInt32x4
+ Opbroadcast1To4MaskedInt64x2
+ Opbroadcast1To4MaskedUint32x4
+ Opbroadcast1To4MaskedUint64x2
+ Opbroadcast1To4Uint32x4
+ Opbroadcast1To4Uint64x2
+ Opbroadcast1To8Float32x4
+ Opbroadcast1To8Float64x2
+ Opbroadcast1To8Int16x8
+ Opbroadcast1To8Int32x4
+ Opbroadcast1To8Int64x2
+ Opbroadcast1To8MaskedFloat32x4
+ Opbroadcast1To8MaskedFloat64x2
+ Opbroadcast1To8MaskedInt16x8
+ Opbroadcast1To8MaskedInt32x4
+ Opbroadcast1To8MaskedInt64x2
+ Opbroadcast1To8MaskedUint16x8
+ Opbroadcast1To8MaskedUint32x4
+ Opbroadcast1To8MaskedUint64x2
+ Opbroadcast1To8Uint16x8
+ Opbroadcast1To8Uint32x4
+ Opbroadcast1To8Uint64x2
+ Opbroadcast1To16Float32x4
+ Opbroadcast1To16Int8x16
+ Opbroadcast1To16Int16x8
+ Opbroadcast1To16Int32x4
+ Opbroadcast1To16MaskedFloat32x4
+ Opbroadcast1To16MaskedInt8x16
+ Opbroadcast1To16MaskedInt16x8
+ Opbroadcast1To16MaskedInt32x4
+ Opbroadcast1To16MaskedUint8x16
+ Opbroadcast1To16MaskedUint16x8
+ Opbroadcast1To16MaskedUint32x4
+ Opbroadcast1To16Uint8x16
+ Opbroadcast1To16Uint16x8
+ Opbroadcast1To16Uint32x4
+ Opbroadcast1To32Int8x16
+ Opbroadcast1To32Int16x8
+ Opbroadcast1To32MaskedInt8x16
+ Opbroadcast1To32MaskedInt16x8
+ Opbroadcast1To32MaskedUint8x16
+ Opbroadcast1To32MaskedUint16x8
+ Opbroadcast1To32Uint8x16
+ Opbroadcast1To32Uint16x8
+ Opbroadcast1To64Int8x16
+ Opbroadcast1To64MaskedInt8x16
+ Opbroadcast1To64MaskedUint8x16
+ Opbroadcast1To64Uint8x16
OpAESRoundKeyGenAssistUint32x4
OpCeilScaledFloat32x4
OpCeilScaledFloat32x8
@@ -90401,156 +90431,6 @@
generic: true,
},
{
- name: "Broadcast1To2Float64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To2Int64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To2Uint64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Float32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Float64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Int32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Int64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Uint32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Uint64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Float32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Float64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Int16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Int32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Int64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Uint16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Uint32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Uint64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Float32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Int8x16",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Int16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Int32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Uint8x16",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Uint16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Uint32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To32Int8x16",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To32Int16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To32Uint8x16",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To32Uint16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To64Int8x16",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To64Uint8x16",
- argLen: 1,
- generic: true,
- },
- {
name: "CeilFloat32x4",
argLen: 1,
generic: true,
@@ -95572,6 +95452,306 @@
generic: true,
},
{
+ name: "broadcast1To2Float64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To2Int64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To2MaskedFloat64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To2MaskedInt64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To2MaskedUint64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To2Uint64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Float32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Float64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Int32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Int64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedFloat32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedFloat64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedInt32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedInt64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedUint32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedUint64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Uint32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Uint64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Float32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Float64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Int16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Int32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Int64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedFloat32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedFloat64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedInt32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedInt64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedUint16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedUint32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedUint64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Uint16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Uint32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Uint64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Float32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Int8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Int16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Int32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedFloat32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedInt8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedInt32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedUint8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedUint16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedUint32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Uint8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Uint16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Uint32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32Int8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32Int16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32MaskedInt8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32MaskedInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32MaskedUint8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32MaskedUint16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32Uint8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32Uint16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To64Int8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To64MaskedInt8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To64MaskedUint8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To64Uint8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
name: "AESRoundKeyGenAssistUint32x4",
auxType: auxUInt8,
argLen: 1,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index e9bb9f1..dc3c553 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2557,96 +2557,6 @@
return rewriteValueAMD64_OpBitLen64(v)
case OpBitLen8:
return rewriteValueAMD64_OpBitLen8(v)
- case OpBroadcast1To16Float32x4:
- v.Op = OpAMD64VBROADCASTSS512
- return true
- case OpBroadcast1To16Int16x8:
- v.Op = OpAMD64VPBROADCASTW256
- return true
- case OpBroadcast1To16Int32x4:
- v.Op = OpAMD64VPBROADCASTD512
- return true
- case OpBroadcast1To16Int8x16:
- v.Op = OpAMD64VPBROADCASTB128
- return true
- case OpBroadcast1To16Uint16x8:
- v.Op = OpAMD64VPBROADCASTW256
- return true
- case OpBroadcast1To16Uint32x4:
- v.Op = OpAMD64VPBROADCASTD512
- return true
- case OpBroadcast1To16Uint8x16:
- v.Op = OpAMD64VPBROADCASTB128
- return true
- case OpBroadcast1To2Float64x2:
- v.Op = OpAMD64VPBROADCASTQ128
- return true
- case OpBroadcast1To2Int64x2:
- v.Op = OpAMD64VPBROADCASTQ128
- return true
- case OpBroadcast1To2Uint64x2:
- v.Op = OpAMD64VPBROADCASTQ128
- return true
- case OpBroadcast1To32Int16x8:
- v.Op = OpAMD64VPBROADCASTW512
- return true
- case OpBroadcast1To32Int8x16:
- v.Op = OpAMD64VPBROADCASTB256
- return true
- case OpBroadcast1To32Uint16x8:
- v.Op = OpAMD64VPBROADCASTW512
- return true
- case OpBroadcast1To32Uint8x16:
- v.Op = OpAMD64VPBROADCASTB256
- return true
- case OpBroadcast1To4Float32x4:
- v.Op = OpAMD64VBROADCASTSS128
- return true
- case OpBroadcast1To4Float64x2:
- v.Op = OpAMD64VBROADCASTSD256
- return true
- case OpBroadcast1To4Int32x4:
- v.Op = OpAMD64VPBROADCASTD128
- return true
- case OpBroadcast1To4Int64x2:
- v.Op = OpAMD64VPBROADCASTQ256
- return true
- case OpBroadcast1To4Uint32x4:
- v.Op = OpAMD64VPBROADCASTD128
- return true
- case OpBroadcast1To4Uint64x2:
- v.Op = OpAMD64VPBROADCASTQ256
- return true
- case OpBroadcast1To64Int8x16:
- v.Op = OpAMD64VPBROADCASTB512
- return true
- case OpBroadcast1To64Uint8x16:
- v.Op = OpAMD64VPBROADCASTB512
- return true
- case OpBroadcast1To8Float32x4:
- v.Op = OpAMD64VBROADCASTSS256
- return true
- case OpBroadcast1To8Float64x2:
- v.Op = OpAMD64VBROADCASTSD512
- return true
- case OpBroadcast1To8Int16x8:
- v.Op = OpAMD64VPBROADCASTW128
- return true
- case OpBroadcast1To8Int32x4:
- v.Op = OpAMD64VPBROADCASTD256
- return true
- case OpBroadcast1To8Int64x2:
- v.Op = OpAMD64VPBROADCASTQ512
- return true
- case OpBroadcast1To8Uint16x8:
- v.Op = OpAMD64VPBROADCASTW128
- return true
- case OpBroadcast1To8Uint32x4:
- v.Op = OpAMD64VPBROADCASTD256
- return true
- case OpBroadcast1To8Uint64x2:
- v.Op = OpAMD64VPBROADCASTQ512
- return true
case OpBswap16:
return rewriteValueAMD64_OpBswap16(v)
case OpBswap32:
@@ -6360,6 +6270,156 @@
return rewriteValueAMD64_OpblendMaskedInt64x8(v)
case OpblendMaskedInt8x64:
return rewriteValueAMD64_OpblendMaskedInt8x64(v)
+ case Opbroadcast1To16Float32x4:
+ v.Op = OpAMD64VBROADCASTSS512
+ return true
+ case Opbroadcast1To16Int16x8:
+ v.Op = OpAMD64VPBROADCASTW256
+ return true
+ case Opbroadcast1To16Int32x4:
+ v.Op = OpAMD64VPBROADCASTD512
+ return true
+ case Opbroadcast1To16Int8x16:
+ v.Op = OpAMD64VPBROADCASTB128
+ return true
+ case Opbroadcast1To16MaskedFloat32x4:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedFloat32x4(v)
+ case Opbroadcast1To16MaskedInt16x8:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedInt16x8(v)
+ case Opbroadcast1To16MaskedInt32x4:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedInt32x4(v)
+ case Opbroadcast1To16MaskedInt8x16:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedInt8x16(v)
+ case Opbroadcast1To16MaskedUint16x8:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedUint16x8(v)
+ case Opbroadcast1To16MaskedUint32x4:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedUint32x4(v)
+ case Opbroadcast1To16MaskedUint8x16:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedUint8x16(v)
+ case Opbroadcast1To16Uint16x8:
+ v.Op = OpAMD64VPBROADCASTW256
+ return true
+ case Opbroadcast1To16Uint32x4:
+ v.Op = OpAMD64VPBROADCASTD512
+ return true
+ case Opbroadcast1To16Uint8x16:
+ v.Op = OpAMD64VPBROADCASTB128
+ return true
+ case Opbroadcast1To2Float64x2:
+ v.Op = OpAMD64VPBROADCASTQ128
+ return true
+ case Opbroadcast1To2Int64x2:
+ v.Op = OpAMD64VPBROADCASTQ128
+ return true
+ case Opbroadcast1To2MaskedFloat64x2:
+ return rewriteValueAMD64_Opbroadcast1To2MaskedFloat64x2(v)
+ case Opbroadcast1To2MaskedInt64x2:
+ return rewriteValueAMD64_Opbroadcast1To2MaskedInt64x2(v)
+ case Opbroadcast1To2MaskedUint64x2:
+ return rewriteValueAMD64_Opbroadcast1To2MaskedUint64x2(v)
+ case Opbroadcast1To2Uint64x2:
+ v.Op = OpAMD64VPBROADCASTQ128
+ return true
+ case Opbroadcast1To32Int16x8:
+ v.Op = OpAMD64VPBROADCASTW512
+ return true
+ case Opbroadcast1To32Int8x16:
+ v.Op = OpAMD64VPBROADCASTB256
+ return true
+ case Opbroadcast1To32MaskedInt16x8:
+ return rewriteValueAMD64_Opbroadcast1To32MaskedInt16x8(v)
+ case Opbroadcast1To32MaskedInt8x16:
+ return rewriteValueAMD64_Opbroadcast1To32MaskedInt8x16(v)
+ case Opbroadcast1To32MaskedUint16x8:
+ return rewriteValueAMD64_Opbroadcast1To32MaskedUint16x8(v)
+ case Opbroadcast1To32MaskedUint8x16:
+ return rewriteValueAMD64_Opbroadcast1To32MaskedUint8x16(v)
+ case Opbroadcast1To32Uint16x8:
+ v.Op = OpAMD64VPBROADCASTW512
+ return true
+ case Opbroadcast1To32Uint8x16:
+ v.Op = OpAMD64VPBROADCASTB256
+ return true
+ case Opbroadcast1To4Float32x4:
+ v.Op = OpAMD64VBROADCASTSS128
+ return true
+ case Opbroadcast1To4Float64x2:
+ v.Op = OpAMD64VBROADCASTSD256
+ return true
+ case Opbroadcast1To4Int32x4:
+ v.Op = OpAMD64VPBROADCASTD128
+ return true
+ case Opbroadcast1To4Int64x2:
+ v.Op = OpAMD64VPBROADCASTQ256
+ return true
+ case Opbroadcast1To4MaskedFloat32x4:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedFloat32x4(v)
+ case Opbroadcast1To4MaskedFloat64x2:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedFloat64x2(v)
+ case Opbroadcast1To4MaskedInt32x4:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedInt32x4(v)
+ case Opbroadcast1To4MaskedInt64x2:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedInt64x2(v)
+ case Opbroadcast1To4MaskedUint32x4:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedUint32x4(v)
+ case Opbroadcast1To4MaskedUint64x2:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedUint64x2(v)
+ case Opbroadcast1To4Uint32x4:
+ v.Op = OpAMD64VPBROADCASTD128
+ return true
+ case Opbroadcast1To4Uint64x2:
+ v.Op = OpAMD64VPBROADCASTQ256
+ return true
+ case Opbroadcast1To64Int8x16:
+ v.Op = OpAMD64VPBROADCASTB512
+ return true
+ case Opbroadcast1To64MaskedInt8x16:
+ return rewriteValueAMD64_Opbroadcast1To64MaskedInt8x16(v)
+ case Opbroadcast1To64MaskedUint8x16:
+ return rewriteValueAMD64_Opbroadcast1To64MaskedUint8x16(v)
+ case Opbroadcast1To64Uint8x16:
+ v.Op = OpAMD64VPBROADCASTB512
+ return true
+ case Opbroadcast1To8Float32x4:
+ v.Op = OpAMD64VBROADCASTSS256
+ return true
+ case Opbroadcast1To8Float64x2:
+ v.Op = OpAMD64VBROADCASTSD512
+ return true
+ case Opbroadcast1To8Int16x8:
+ v.Op = OpAMD64VPBROADCASTW128
+ return true
+ case Opbroadcast1To8Int32x4:
+ v.Op = OpAMD64VPBROADCASTD256
+ return true
+ case Opbroadcast1To8Int64x2:
+ v.Op = OpAMD64VPBROADCASTQ512
+ return true
+ case Opbroadcast1To8MaskedFloat32x4:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedFloat32x4(v)
+ case Opbroadcast1To8MaskedFloat64x2:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedFloat64x2(v)
+ case Opbroadcast1To8MaskedInt16x8:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedInt16x8(v)
+ case Opbroadcast1To8MaskedInt32x4:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedInt32x4(v)
+ case Opbroadcast1To8MaskedInt64x2:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedInt64x2(v)
+ case Opbroadcast1To8MaskedUint16x8:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedUint16x8(v)
+ case Opbroadcast1To8MaskedUint32x4:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedUint32x4(v)
+ case Opbroadcast1To8MaskedUint64x2:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedUint64x2(v)
+ case Opbroadcast1To8Uint16x8:
+ v.Op = OpAMD64VPBROADCASTW128
+ return true
+ case Opbroadcast1To8Uint32x4:
+ v.Op = OpAMD64VPBROADCASTD256
+ return true
+ case Opbroadcast1To8Uint64x2:
+ v.Op = OpAMD64VPBROADCASTQ512
+ return true
case OpcarrylessMultiplyUint64x2:
v.Op = OpAMD64VPCLMULQDQ128
return true
@@ -33324,18 +33384,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU16Masked128 (VPBROADCASTW128 x) mask)
- // result: (VPBROADCASTWMasked128 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTW128 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTWMasked128)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU16Masked128 (VPERMI2W128 x y z) mask)
// result: (VPERMI2WMasked128 x y z mask)
for {
@@ -33890,18 +33938,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU16Masked256 (VPBROADCASTW256 x) mask)
- // result: (VPBROADCASTWMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTW256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTWMasked256)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU16Masked256 (VPERMI2W256 x y z) mask)
// result: (VPERMI2WMasked256 x y z mask)
for {
@@ -34492,18 +34528,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU16Masked512 (VPBROADCASTW512 x) mask)
- // result: (VPBROADCASTWMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTW512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTWMasked512)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU16Masked512 (VPERMI2W512 x y z) mask)
// result: (VPERMI2WMasked512 x y z mask)
for {
@@ -34996,30 +35020,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU32Masked128 (VBROADCASTSS128 x) mask)
- // result: (VBROADCASTSSMasked128 x mask)
- for {
- if v_0.Op != OpAMD64VBROADCASTSS128 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VBROADCASTSSMasked128)
- v.AddArg2(x, mask)
- return true
- }
- // match: (VMOVDQU32Masked128 (VPBROADCASTD128 x) mask)
- // result: (VPBROADCASTDMasked128 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTD128 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTDMasked128)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU32Masked128 (VRNDSCALEPS128 [a] x) mask)
// result: (VRNDSCALEPSMasked128 [a] x mask)
for {
@@ -35769,30 +35769,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU32Masked256 (VBROADCASTSS256 x) mask)
- // result: (VBROADCASTSSMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VBROADCASTSS256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VBROADCASTSSMasked256)
- v.AddArg2(x, mask)
- return true
- }
- // match: (VMOVDQU32Masked256 (VPBROADCASTD256 x) mask)
- // result: (VPBROADCASTDMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTD256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTDMasked256)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU32Masked256 (VRNDSCALEPS256 [a] x) mask)
// result: (VRNDSCALEPSMasked256 [a] x mask)
for {
@@ -36690,30 +36666,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU32Masked512 (VBROADCASTSS512 x) mask)
- // result: (VBROADCASTSSMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VBROADCASTSS512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VBROADCASTSSMasked512)
- v.AddArg2(x, mask)
- return true
- }
- // match: (VMOVDQU32Masked512 (VPBROADCASTD512 x) mask)
- // result: (VPBROADCASTDMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTD512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTDMasked512)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU32Masked512 (VRNDSCALEPS512 [a] x) mask)
// result: (VRNDSCALEPSMasked512 [a] x mask)
for {
@@ -37563,18 +37515,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU64Masked128 (VPBROADCASTQ128 x) mask)
- // result: (VPBROADCASTQMasked128 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTQ128 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTQMasked128)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU64Masked128 (VRNDSCALEPD128 [a] x) mask)
// result: (VRNDSCALEPDMasked128 [a] x mask)
for {
@@ -38440,30 +38380,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU64Masked256 (VBROADCASTSD256 x) mask)
- // result: (VBROADCASTSDMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VBROADCASTSD256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VBROADCASTSDMasked256)
- v.AddArg2(x, mask)
- return true
- }
- // match: (VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask)
- // result: (VPBROADCASTQMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTQ256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTQMasked256)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU64Masked256 (VRNDSCALEPD256 [a] x) mask)
// result: (VRNDSCALEPDMasked256 [a] x mask)
for {
@@ -39357,30 +39273,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU64Masked512 (VBROADCASTSD512 x) mask)
- // result: (VBROADCASTSDMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VBROADCASTSD512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VBROADCASTSDMasked512)
- v.AddArg2(x, mask)
- return true
- }
- // match: (VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask)
- // result: (VPBROADCASTQMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTQ512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTQMasked512)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU64Masked512 (VRNDSCALEPD512 [a] x) mask)
// result: (VRNDSCALEPDMasked512 [a] x mask)
for {
@@ -40168,18 +40060,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU8Masked128 (VPBROADCASTB128 x) mask)
- // result: (VPBROADCASTBMasked128 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTB128 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTBMasked128)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU8Masked128 (VPERMI2B128 x y z) mask)
// result: (VPERMI2BMasked128 x y z mask)
for {
@@ -40522,18 +40402,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU8Masked256 (VPBROADCASTB256 x) mask)
- // result: (VPBROADCASTBMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTB256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTBMasked256)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU8Masked256 (VPERMI2B256 x y z) mask)
// result: (VPERMI2BMasked256 x y z mask)
for {
@@ -40876,18 +40744,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU8Masked512 (VPBROADCASTB512 x) mask)
- // result: (VPBROADCASTBMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTB512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTBMasked512)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU8Masked512 (VPERMI2B512 x y z) mask)
// result: (VPERMI2BMasked512 x y z mask)
for {
@@ -79558,6 +79414,486 @@
return true
}
}
+func rewriteValueAMD64_Opbroadcast1To16MaskedFloat32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedFloat32x4 x mask)
+ // result: (VBROADCASTSSMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VBROADCASTSSMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedInt16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedInt16x8 x mask)
+ // result: (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedInt32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedInt32x4 x mask)
+ // result: (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedInt8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedInt8x16 x mask)
+ // result: (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedUint16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedUint16x8 x mask)
+ // result: (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedUint32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedUint32x4 x mask)
+ // result: (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedUint8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedUint8x16 x mask)
+ // result: (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To2MaskedFloat64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To2MaskedFloat64x2 x mask)
+ // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To2MaskedInt64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To2MaskedInt64x2 x mask)
+ // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To2MaskedUint64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To2MaskedUint64x2 x mask)
+ // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To32MaskedInt16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To32MaskedInt16x8 x mask)
+ // result: (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To32MaskedInt8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To32MaskedInt8x16 x mask)
+ // result: (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To32MaskedUint16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To32MaskedUint16x8 x mask)
+ // result: (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To32MaskedUint8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To32MaskedUint8x16 x mask)
+ // result: (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedFloat32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedFloat32x4 x mask)
+ // result: (VBROADCASTSSMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VBROADCASTSSMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedFloat64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedFloat64x2 x mask)
+ // result: (VBROADCASTSDMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VBROADCASTSDMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedInt32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedInt32x4 x mask)
+ // result: (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedInt64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedInt64x2 x mask)
+ // result: (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedUint32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedUint32x4 x mask)
+ // result: (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedUint64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedUint64x2 x mask)
+ // result: (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To64MaskedInt8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To64MaskedInt8x16 x mask)
+ // result: (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To64MaskedUint8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To64MaskedUint8x16 x mask)
+ // result: (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedFloat32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedFloat32x4 x mask)
+ // result: (VBROADCASTSSMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VBROADCASTSSMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedFloat64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedFloat64x2 x mask)
+ // result: (VBROADCASTSDMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VBROADCASTSDMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedInt16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedInt16x8 x mask)
+ // result: (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedInt32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedInt32x4 x mask)
+ // result: (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedInt64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedInt64x2 x mask)
+ // result: (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedUint16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedUint16x8 x mask)
+ // result: (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedUint32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedUint32x4 x mask)
+ // result: (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedUint64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedUint64x2 x mask)
+ // result: (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
func rewriteBlockAMD64(b *Block) bool {
typ := &b.Func.Config.Types
switch b.Kind {
diff --git a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
index cc298c0..5884c31 100644
--- a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
@@ -150,36 +150,6 @@
addF(simdPackage, "Uint16x8.Average", opLen2(ssa.OpAverageUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.Average", opLen2(ssa.OpAverageUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.Average", opLen2(ssa.OpAverageUint16x32, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Float64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Int64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Uint64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float64x2, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int64x2, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint64x2, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float32x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float64x2, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int32x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int64x2, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint32x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint64x2, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Float32x4, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int8x16, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int16x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int32x4, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint8x16, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint16x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint32x4, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int8x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int16x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint8x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint16x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Int8x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Uint8x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Ceil", opLen1(ssa.OpCeilFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64)
@@ -1295,6 +1265,66 @@
addF(simdPackage, "Int16x32.blendMasked", opLen3(ssa.OpblendMaskedInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.blendMasked", opLen3(ssa.OpblendMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.blendMasked", opLen3(ssa.OpblendMaskedInt64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To2", opLen1(ssa.Opbroadcast1To2Float64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To2", opLen1(ssa.Opbroadcast1To2Int64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To2", opLen1(ssa.Opbroadcast1To2Uint64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To2Masked", opLen2(ssa.Opbroadcast1To2MaskedFloat64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To2Masked", opLen2(ssa.Opbroadcast1To2MaskedInt64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To2Masked", opLen2(ssa.Opbroadcast1To2MaskedUint64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To4", opLen1(ssa.Opbroadcast1To4Float32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To4", opLen1(ssa.Opbroadcast1To4Float64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To4", opLen1(ssa.Opbroadcast1To4Int32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To4", opLen1(ssa.Opbroadcast1To4Int64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To4", opLen1(ssa.Opbroadcast1To4Uint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To4", opLen1(ssa.Opbroadcast1To4Uint64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedFloat32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedFloat64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedInt64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedUint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedUint64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To8", opLen1(ssa.Opbroadcast1To8Float32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To8", opLen1(ssa.Opbroadcast1To8Float64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To8", opLen1(ssa.Opbroadcast1To8Int16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To8", opLen1(ssa.Opbroadcast1To8Int32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To8", opLen1(ssa.Opbroadcast1To8Int64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To8", opLen1(ssa.Opbroadcast1To8Uint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To8", opLen1(ssa.Opbroadcast1To8Uint32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To8", opLen1(ssa.Opbroadcast1To8Uint64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedFloat32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedFloat64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedInt16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedInt32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedInt64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedUint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedUint32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedUint64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To16", opLen1(ssa.Opbroadcast1To16Float32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To16", opLen1(ssa.Opbroadcast1To16Int8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To16", opLen1(ssa.Opbroadcast1To16Int16x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To16", opLen1(ssa.Opbroadcast1To16Int32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To16", opLen1(ssa.Opbroadcast1To16Uint8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To16", opLen1(ssa.Opbroadcast1To16Uint16x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To16", opLen1(ssa.Opbroadcast1To16Uint32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedFloat32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedInt8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedInt16x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedInt32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedUint8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedUint16x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedUint32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To32", opLen1(ssa.Opbroadcast1To32Int8x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To32", opLen1(ssa.Opbroadcast1To32Int16x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To32", opLen1(ssa.Opbroadcast1To32Uint8x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To32", opLen1(ssa.Opbroadcast1To32Uint16x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To32Masked", opLen2(ssa.Opbroadcast1To32MaskedInt8x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To32Masked", opLen2(ssa.Opbroadcast1To32MaskedInt16x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To32Masked", opLen2(ssa.Opbroadcast1To32MaskedUint8x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To32Masked", opLen2(ssa.Opbroadcast1To32MaskedUint16x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To64", opLen1(ssa.Opbroadcast1To64Int8x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To64", opLen1(ssa.Opbroadcast1To64Uint8x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To64Masked", opLen2(ssa.Opbroadcast1To64MaskedInt8x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To64Masked", opLen2(ssa.Opbroadcast1To64MaskedUint8x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x2.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x2, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint64x4.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x4, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint64x8.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x8, types.TypeVec512, 0), sys.AMD64)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
index 3f8489c..70c8178 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
@@ -70,32 +70,32 @@
commutative: false
documentation: !string |-
// NAME expands the lower elements of x into the masked elements of z.
-- go: Broadcast1To2
+- go: broadcast1To2
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 2 elements of
// the output vector.
-- go: Broadcast1To4
+- go: broadcast1To4
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 4 elements of
// the output vector.
-- go: Broadcast1To8
+- go: broadcast1To8
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 8 elements of
// the output vector.
-- go: Broadcast1To16
+- go: broadcast1To16
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 16 elements of
// the output vector.
-- go: Broadcast1To32
+- go: broadcast1To32
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 32 elements of
// the output vector.
-- go: Broadcast1To64
+- go: broadcast1To64
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 64 elements of
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
index a709c3d..1c0e371 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
@@ -347,7 +347,7 @@
out:
- go: $t

-- go: Broadcast1To2
+- go: broadcast1To2
asm: VPBROADCASTQ
in:
- class: vreg
@@ -361,7 +361,7 @@
base: $b

# weirdly, this one case on AVX2 is memory-operand-only
-- go: Broadcast1To2
+- go: broadcast1To2
asm: VPBROADCASTQ
in:
- class: vreg
@@ -376,7 +376,7 @@
base: int
OverwriteBase: float

-- go: Broadcast1To4
+- go: broadcast1To4
asm: VPBROADCAST[BWDQ]
in:
- class: vreg
@@ -387,7 +387,7 @@
lanes: 4
base: $b

-- go: Broadcast1To8
+- go: broadcast1To8
asm: VPBROADCAST[BWDQ]
in:
- class: vreg
@@ -398,7 +398,7 @@
lanes: 8
base: $b

-- go: Broadcast1To16
+- go: broadcast1To16
asm: VPBROADCAST[BWDQ]
in:
- class: vreg
@@ -409,7 +409,7 @@
lanes: 16
base: $b

-- go: Broadcast1To32
+- go: broadcast1To32
asm: VPBROADCAST[BWDQ]
in:
- class: vreg
@@ -420,7 +420,7 @@
lanes: 32
base: $b

-- go: Broadcast1To64
+- go: broadcast1To64
asm: VPBROADCASTB
in:
- class: vreg
@@ -431,7 +431,7 @@
lanes: 64
base: $b

-- go: Broadcast1To4
+- go: broadcast1To4
asm: VBROADCASTS[SD]
in:
- class: vreg
@@ -442,7 +442,7 @@
lanes: 4
base: float

-- go: Broadcast1To8
+- go: broadcast1To8
asm: VBROADCASTS[SD]
in:
- class: vreg
@@ -453,7 +453,7 @@
lanes: 8
base: float

-- go: Broadcast1To16
+- go: broadcast1To16
asm: VBROADCASTS[SD]
in:
- class: vreg
diff --git a/src/simd/archsimd/_gen/tmplgen/main.go b/src/simd/archsimd/_gen/tmplgen/main.go
index d6708eb..20db7c9 100644
--- a/src/simd/archsimd/_gen/tmplgen/main.go
+++ b/src/simd/archsimd/_gen/tmplgen/main.go
@@ -898,7 +898,7 @@
// Emulated, CPU Feature: {{.CPUfeatureBC}}
func Broadcast{{.VType}}(x {{.Etype}}) {{.VType}} {
var z {{.As128BitVec }}
- return z.SetElem(0, x).Broadcast1To{{.Count}}()
+ return z.SetElem(0, x).broadcast1To{{.Count}}()
}
`)

diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index d17c1d9..2ee694c 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -805,198 +805,6 @@
// Asm: VPAVGW, CPU Feature: AVX512
func (x Uint16x32) Average(y Uint16x32) Uint16x32

-/* Broadcast1To2 */
-
-// Broadcast1To2 copies the lowest element of its input to all 2 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Float64x2) Broadcast1To2() Float64x2
-
-// Broadcast1To2 copies the lowest element of its input to all 2 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Int64x2) Broadcast1To2() Int64x2
-
-// Broadcast1To2 copies the lowest element of its input to all 2 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Uint64x2) Broadcast1To2() Uint64x2
-
-/* Broadcast1To4 */
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VBROADCASTSS, CPU Feature: AVX2
-func (x Float32x4) Broadcast1To4() Float32x4
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VBROADCASTSD, CPU Feature: AVX2
-func (x Float64x2) Broadcast1To4() Float64x4
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Int32x4) Broadcast1To4() Int32x4
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Int64x2) Broadcast1To4() Int64x4
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Uint32x4) Broadcast1To4() Uint32x4
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Uint64x2) Broadcast1To4() Uint64x4
-
-/* Broadcast1To8 */
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VBROADCASTSS, CPU Feature: AVX2
-func (x Float32x4) Broadcast1To8() Float32x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VBROADCASTSD, CPU Feature: AVX512
-func (x Float64x2) Broadcast1To8() Float64x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Int16x8) Broadcast1To8() Int16x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Int32x4) Broadcast1To8() Int32x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX512
-func (x Int64x2) Broadcast1To8() Int64x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Uint16x8) Broadcast1To8() Uint16x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Uint32x4) Broadcast1To8() Uint32x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX512
-func (x Uint64x2) Broadcast1To8() Uint64x8
-
-/* Broadcast1To16 */
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VBROADCASTSS, CPU Feature: AVX512
-func (x Float32x4) Broadcast1To16() Float32x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Int8x16) Broadcast1To16() Int8x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Int16x8) Broadcast1To16() Int16x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX512
-func (x Int32x4) Broadcast1To16() Int32x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Uint8x16) Broadcast1To16() Uint8x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Uint16x8) Broadcast1To16() Uint16x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX512
-func (x Uint32x4) Broadcast1To16() Uint32x16
-
-/* Broadcast1To32 */
-
-// Broadcast1To32 copies the lowest element of its input to all 32 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Int8x16) Broadcast1To32() Int8x32
-
-// Broadcast1To32 copies the lowest element of its input to all 32 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX512
-func (x Int16x8) Broadcast1To32() Int16x32
-
-// Broadcast1To32 copies the lowest element of its input to all 32 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Uint8x16) Broadcast1To32() Uint8x32
-
-// Broadcast1To32 copies the lowest element of its input to all 32 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX512
-func (x Uint16x8) Broadcast1To32() Uint16x32
-
-/* Broadcast1To64 */
-
-// Broadcast1To64 copies the lowest element of its input to all 64 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX512
-func (x Int8x16) Broadcast1To64() Int8x64
-
-// Broadcast1To64 copies the lowest element of its input to all 64 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX512
-func (x Uint8x16) Broadcast1To64() Uint8x64
-
/* Ceil */

// Ceil rounds elements up to the nearest integer.
diff --git a/src/simd/archsimd/ops_internal_amd64.go b/src/simd/archsimd/ops_internal_amd64.go
index 10749c9..d61f442 100644
--- a/src/simd/archsimd/ops_internal_amd64.go
+++ b/src/simd/archsimd/ops_internal_amd64.go
@@ -52,6 +52,450 @@
// Asm: VPBLENDMQ, CPU Feature: AVX512
func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8

+/* broadcast1To2 */
+
+// broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Float64x2) broadcast1To2() Float64x2
+
+// broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Int64x2) broadcast1To2() Int64x2
+
+// broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Uint64x2) broadcast1To2() Uint64x2
+
+/* broadcast1To2Masked */
+
+// broadcast1To2Masked copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Float64x2) broadcast1To2Masked(mask Mask64x2) Float64x2
+
+// broadcast1To2Masked copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) broadcast1To2Masked(mask Mask64x2) Int64x2
+
+// broadcast1To2Masked copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) broadcast1To2Masked(mask Mask64x2) Uint64x2
+
+/* broadcast1To4 */
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX2
+func (x Float32x4) broadcast1To4() Float32x4
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX2
+func (x Float64x2) broadcast1To4() Float64x4
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Int32x4) broadcast1To4() Int32x4
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Int64x2) broadcast1To4() Int64x4
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Uint32x4) broadcast1To4() Uint32x4
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Uint64x2) broadcast1To4() Uint64x4
+
+/* broadcast1To4Masked */
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) broadcast1To4Masked(mask Mask32x4) Float32x4
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) broadcast1To4Masked(mask Mask64x2) Float64x4
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) broadcast1To4Masked(mask Mask32x4) Int32x4
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) broadcast1To4Masked(mask Mask64x2) Int64x4
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) broadcast1To4Masked(mask Mask32x4) Uint32x4
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) broadcast1To4Masked(mask Mask64x2) Uint64x4
+
+/* broadcast1To8 */
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX2
+func (x Float32x4) broadcast1To8() Float32x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) broadcast1To8() Float64x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Int16x8) broadcast1To8() Int16x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Int32x4) broadcast1To8() Int32x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) broadcast1To8() Int64x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Uint16x8) broadcast1To8() Uint16x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Uint32x4) broadcast1To8() Uint32x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) broadcast1To8() Uint64x8
+
+/* broadcast1To8Masked */
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) broadcast1To8Masked(mask Mask32x4) Float32x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) broadcast1To8Masked(mask Mask64x2) Float64x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) broadcast1To8Masked(mask Mask16x8) Int16x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) broadcast1To8Masked(mask Mask32x4) Int32x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) broadcast1To8Masked(mask Mask64x2) Int64x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) broadcast1To8Masked(mask Mask16x8) Uint16x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) broadcast1To8Masked(mask Mask32x4) Uint32x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) broadcast1To8Masked(mask Mask64x2) Uint64x8
+
+/* broadcast1To16 */
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) broadcast1To16() Float32x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) broadcast1To16() Int8x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Int16x8) broadcast1To16() Int16x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) broadcast1To16() Int32x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) broadcast1To16() Uint8x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Uint16x8) broadcast1To16() Uint16x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) broadcast1To16() Uint32x16
+
+/* broadcast1To16Masked */
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) broadcast1To16Masked(mask Mask32x4) Float32x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) broadcast1To16Masked(mask Mask8x16) Int8x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) broadcast1To16Masked(mask Mask16x8) Int16x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) broadcast1To16Masked(mask Mask32x4) Int32x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) broadcast1To16Masked(mask Mask8x16) Uint8x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) broadcast1To16Masked(mask Mask16x8) Uint16x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) broadcast1To16Masked(mask Mask32x4) Uint32x16
+
+/* broadcast1To32 */
+
+// broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) broadcast1To32() Int8x32
+
+// broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) broadcast1To32() Int16x32
+
+// broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) broadcast1To32() Uint8x32
+
+// broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) broadcast1To32() Uint16x32
+
+/* broadcast1To32Masked */
+
+// broadcast1To32Masked copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) broadcast1To32Masked(mask Mask8x16) Int8x32
+
+// broadcast1To32Masked copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) broadcast1To32Masked(mask Mask16x8) Int16x32
+
+// broadcast1To32Masked copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) broadcast1To32Masked(mask Mask8x16) Uint8x32
+
+// broadcast1To32Masked copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) broadcast1To32Masked(mask Mask16x8) Uint16x32
+
+/* broadcast1To64 */
+
+// broadcast1To64 copies the lowest element of its input to all 64 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) broadcast1To64() Int8x64
+
+// broadcast1To64 copies the lowest element of its input to all 64 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) broadcast1To64() Uint8x64
+
+/* broadcast1To64Masked */
+
+// broadcast1To64Masked copies the lowest element of its input to all 64 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) broadcast1To64Masked(mask Mask8x16) Int8x64
+
+// broadcast1To64Masked copies the lowest element of its input to all 64 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) broadcast1To64Masked(mask Mask8x16) Uint8x64
+
/* carrylessMultiply */

// carrylessMultiply computes one of four possible Galois polynomial
diff --git a/src/simd/archsimd/other_gen_amd64.go b/src/simd/archsimd/other_gen_amd64.go
index 61b71e3..6ece31e 100644
--- a/src/simd/archsimd/other_gen_amd64.go
+++ b/src/simd/archsimd/other_gen_amd64.go
@@ -10,7 +10,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt8x16(x int8) Int8x16 {
var z Int8x16
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}

// BroadcastInt16x8 returns a vector with the input
@@ -19,7 +19,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt16x8(x int16) Int16x8 {
var z Int16x8
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}

// BroadcastInt32x4 returns a vector with the input
@@ -28,7 +28,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt32x4(x int32) Int32x4 {
var z Int32x4
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}

// BroadcastInt64x2 returns a vector with the input
@@ -37,7 +37,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt64x2(x int64) Int64x2 {
var z Int64x2
- return z.SetElem(0, x).Broadcast1To2()
+ return z.SetElem(0, x).broadcast1To2()
}

// BroadcastUint8x16 returns a vector with the input
@@ -46,7 +46,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint8x16(x uint8) Uint8x16 {
var z Uint8x16
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}

// BroadcastUint16x8 returns a vector with the input
@@ -55,7 +55,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint16x8(x uint16) Uint16x8 {
var z Uint16x8
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}

// BroadcastUint32x4 returns a vector with the input
@@ -64,7 +64,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint32x4(x uint32) Uint32x4 {
var z Uint32x4
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}

// BroadcastUint64x2 returns a vector with the input
@@ -73,7 +73,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint64x2(x uint64) Uint64x2 {
var z Uint64x2
- return z.SetElem(0, x).Broadcast1To2()
+ return z.SetElem(0, x).broadcast1To2()
}

// BroadcastFloat32x4 returns a vector with the input
@@ -82,7 +82,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastFloat32x4(x float32) Float32x4 {
var z Float32x4
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}

// BroadcastFloat64x2 returns a vector with the input
@@ -91,7 +91,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastFloat64x2(x float64) Float64x2 {
var z Float64x2
- return z.SetElem(0, x).Broadcast1To2()
+ return z.SetElem(0, x).broadcast1To2()
}

// BroadcastInt8x32 returns a vector with the input
@@ -100,7 +100,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt8x32(x int8) Int8x32 {
var z Int8x16
- return z.SetElem(0, x).Broadcast1To32()
+ return z.SetElem(0, x).broadcast1To32()
}

// BroadcastInt16x16 returns a vector with the input
@@ -109,7 +109,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt16x16(x int16) Int16x16 {
var z Int16x8
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}

// BroadcastInt32x8 returns a vector with the input
@@ -118,7 +118,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt32x8(x int32) Int32x8 {
var z Int32x4
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}

// BroadcastInt64x4 returns a vector with the input
@@ -127,7 +127,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt64x4(x int64) Int64x4 {
var z Int64x2
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}

// BroadcastUint8x32 returns a vector with the input
@@ -136,7 +136,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint8x32(x uint8) Uint8x32 {
var z Uint8x16
- return z.SetElem(0, x).Broadcast1To32()
+ return z.SetElem(0, x).broadcast1To32()
}

// BroadcastUint16x16 returns a vector with the input
@@ -145,7 +145,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint16x16(x uint16) Uint16x16 {
var z Uint16x8
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}

// BroadcastUint32x8 returns a vector with the input
@@ -154,7 +154,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint32x8(x uint32) Uint32x8 {
var z Uint32x4
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}

// BroadcastUint64x4 returns a vector with the input
@@ -163,7 +163,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint64x4(x uint64) Uint64x4 {
var z Uint64x2
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}

// BroadcastFloat32x8 returns a vector with the input
@@ -172,7 +172,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastFloat32x8(x float32) Float32x8 {
var z Float32x4
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}

// BroadcastFloat64x4 returns a vector with the input
@@ -181,7 +181,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastFloat64x4(x float64) Float64x4 {
var z Float64x2
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}

// BroadcastInt8x64 returns a vector with the input
@@ -190,7 +190,7 @@
// Emulated, CPU Feature: AVX512BW
func BroadcastInt8x64(x int8) Int8x64 {
var z Int8x16
- return z.SetElem(0, x).Broadcast1To64()
+ return z.SetElem(0, x).broadcast1To64()
}

// BroadcastInt16x32 returns a vector with the input
@@ -199,7 +199,7 @@
// Emulated, CPU Feature: AVX512BW
func BroadcastInt16x32(x int16) Int16x32 {
var z Int16x8
- return z.SetElem(0, x).Broadcast1To32()
+ return z.SetElem(0, x).broadcast1To32()
}

// BroadcastInt32x16 returns a vector with the input
@@ -208,7 +208,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastInt32x16(x int32) Int32x16 {
var z Int32x4
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}

// BroadcastInt64x8 returns a vector with the input
@@ -217,7 +217,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastInt64x8(x int64) Int64x8 {
var z Int64x2
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}

// BroadcastUint8x64 returns a vector with the input
@@ -226,7 +226,7 @@
// Emulated, CPU Feature: AVX512BW
func BroadcastUint8x64(x uint8) Uint8x64 {
var z Uint8x16
- return z.SetElem(0, x).Broadcast1To64()
+ return z.SetElem(0, x).broadcast1To64()
}

// BroadcastUint16x32 returns a vector with the input
@@ -235,7 +235,7 @@
// Emulated, CPU Feature: AVX512BW
func BroadcastUint16x32(x uint16) Uint16x32 {
var z Uint16x8
- return z.SetElem(0, x).Broadcast1To32()
+ return z.SetElem(0, x).broadcast1To32()
}

// BroadcastUint32x16 returns a vector with the input
@@ -244,7 +244,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastUint32x16(x uint32) Uint32x16 {
var z Uint32x4
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}

// BroadcastUint64x8 returns a vector with the input
@@ -253,7 +253,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastUint64x8(x uint64) Uint64x8 {
var z Uint64x2
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}

// BroadcastFloat32x16 returns a vector with the input
@@ -262,7 +262,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastFloat32x16(x float32) Float32x16 {
var z Float32x4
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}

// BroadcastFloat64x8 returns a vector with the input
@@ -271,7 +271,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastFloat64x8(x float64) Float64x8 {
var z Float64x2
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}

// ToMask returns a mask whose i'th element is set if x[i] is non-zero.

Change information

Files:
  • M src/cmd/compile/internal/amd64/simdssa.go
  • M src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
  • M src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
  • M src/cmd/compile/internal/ssa/opGen.go
  • M src/cmd/compile/internal/ssa/rewriteAMD64.go
  • M src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
  • M src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
  • M src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
  • M src/simd/archsimd/_gen/tmplgen/main.go
  • M src/simd/archsimd/ops_amd64.go
  • M src/simd/archsimd/ops_internal_amd64.go
  • M src/simd/archsimd/other_gen_amd64.go
Change size: XL
Delta: 12 files changed, 1730 insertions(+), 889 deletions(-)
Open in Gerrit

Related details

Attention set is empty
Submit Requirements:
  • requirement is not satisfied: Code-Review
  • requirement is satisfied: No-Unresolved-Comments
  • requirement is not satisfied: Review-Enforcement
  • requirement is not satisfied: TryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newchange
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: Ied1b159a68b54785254b35ea61145dfdc27832b6
Gerrit-Change-Number: 775622
Gerrit-PatchSet: 1
Gerrit-Owner: Junyang Shao <shaoj...@google.com>
unsatisfied_requirement
satisfied_requirement
open
diffy

Junyang Shao (Gerrit)

unread,
May 8, 2026, 4:48:17 PM (3 days ago) May 8
to goph...@pubsubhelper.golang.org, David Chase, Cherry Mui, golang-co...@googlegroups.com
Attention needed from Cherry Mui and David Chase

Junyang Shao voted Commit-Queue+1

Commit-Queue+1
Open in Gerrit

Related details

Attention is currently required from:
  • Cherry Mui
  • David Chase
Submit Requirements:
  • requirement is not satisfied: Code-Review
  • requirement is satisfied: No-Unresolved-Comments
  • requirement is not satisfied: Review-Enforcement
  • requirement is not satisfied: TryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: comment
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: Ied1b159a68b54785254b35ea61145dfdc27832b6
Gerrit-Change-Number: 775622
Gerrit-PatchSet: 1
Gerrit-Owner: Junyang Shao <shaoj...@google.com>
Gerrit-Reviewer: Cherry Mui <cher...@google.com>
Gerrit-Reviewer: David Chase <drc...@google.com>
Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
Gerrit-Attention: Cherry Mui <cher...@google.com>
Gerrit-Attention: David Chase <drc...@google.com>
Gerrit-Comment-Date: Fri, 08 May 2026 20:48:14 +0000
Gerrit-HasComments: No
Gerrit-Has-Labels: Yes
unsatisfied_requirement
satisfied_requirement
open
diffy

Junyang Shao (Gerrit)

unread,
May 8, 2026, 5:11:37 PM (3 days ago) May 8
to goph...@pubsubhelper.golang.org, golang...@luci-project-accounts.iam.gserviceaccount.com, David Chase, Cherry Mui, golang-co...@googlegroups.com
Attention needed from Cherry Mui and David Chase

Junyang Shao voted Auto-Submit+1

Auto-Submit+1
Open in Gerrit

Related details

Attention is currently required from:
  • Cherry Mui
  • David Chase
Submit Requirements:
    • requirement is not satisfied: Code-Review
    • requirement is satisfied: No-Unresolved-Comments
    • requirement is not satisfied: Review-Enforcement
    • requirement is satisfied: TryBots-Pass
    Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
    Gerrit-MessageType: comment
    Gerrit-Project: go
    Gerrit-Branch: dev.simd
    Gerrit-Change-Id: Ied1b159a68b54785254b35ea61145dfdc27832b6
    Gerrit-Change-Number: 775622
    Gerrit-PatchSet: 1
    Gerrit-Owner: Junyang Shao <shaoj...@google.com>
    Gerrit-Reviewer: Cherry Mui <cher...@google.com>
    Gerrit-Reviewer: David Chase <drc...@google.com>
    Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
    Gerrit-Attention: Cherry Mui <cher...@google.com>
    Gerrit-Attention: David Chase <drc...@google.com>
    Gerrit-Comment-Date: Fri, 08 May 2026 21:11:34 +0000
    Gerrit-HasComments: No
    Gerrit-Has-Labels: Yes
    unsatisfied_requirement
    satisfied_requirement
    open
    diffy

    David Chase (Gerrit)

    unread,
    5:52 PM (5 hours ago) 5:52 PM
    to Junyang Shao, goph...@pubsubhelper.golang.org, Austin Clements, golang...@luci-project-accounts.iam.gserviceaccount.com, Cherry Mui, golang-co...@googlegroups.com
    Attention needed from Austin Clements, Cherry Mui and Junyang Shao

    David Chase added 1 comment

    Patchset-level comments
    File-level comment, Patchset 3 (Latest):
    David Chase . resolved

    I'm trying to be sure that this is what we wanted to do, I keep on thinking to look at the CL when I am tired.

    Open in Gerrit

    Related details

    Attention is currently required from:
    • Austin Clements
    • Cherry Mui
    • Junyang Shao
    Submit Requirements:
      • requirement is not satisfied: Code-Review
      • requirement is satisfied: No-Unresolved-Comments
      • requirement is not satisfied: Review-Enforcement
      • requirement is not satisfied: TryBots-Pass
      Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
      Gerrit-MessageType: comment
      Gerrit-Project: go
      Gerrit-Branch: dev.simd
      Gerrit-Change-Id: Ied1b159a68b54785254b35ea61145dfdc27832b6
      Gerrit-Change-Number: 775622
      Gerrit-PatchSet: 3
      Gerrit-Owner: Junyang Shao <shaoj...@google.com>
      Gerrit-Reviewer: Austin Clements <aus...@google.com>
      Gerrit-Reviewer: Cherry Mui <cher...@google.com>
      Gerrit-Reviewer: David Chase <drc...@google.com>
      Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
      Gerrit-Attention: Cherry Mui <cher...@google.com>
      Gerrit-Attention: Junyang Shao <shaoj...@google.com>
      Gerrit-Attention: Austin Clements <aus...@google.com>
      Gerrit-Comment-Date: Mon, 11 May 2026 21:52:11 +0000
      Gerrit-HasComments: Yes
      Gerrit-Has-Labels: No
      unsatisfied_requirement
      satisfied_requirement
      open
      diffy
      Reply all
      Reply to author
      Forward
      0 new messages