diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index f94b299..7659601 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -25,23 +25,6 @@
ssa.OpAMD64VPABSQ128,
ssa.OpAMD64VPABSQ256,
ssa.OpAMD64VPABSQ512,
- ssa.OpAMD64VPBROADCASTQ128,
- ssa.OpAMD64VBROADCASTSS128,
- ssa.OpAMD64VBROADCASTSD256,
- ssa.OpAMD64VPBROADCASTD128,
- ssa.OpAMD64VPBROADCASTQ256,
- ssa.OpAMD64VBROADCASTSS256,
- ssa.OpAMD64VBROADCASTSD512,
- ssa.OpAMD64VPBROADCASTW128,
- ssa.OpAMD64VPBROADCASTD256,
- ssa.OpAMD64VPBROADCASTQ512,
- ssa.OpAMD64VBROADCASTSS512,
- ssa.OpAMD64VPBROADCASTB128,
- ssa.OpAMD64VPBROADCASTW256,
- ssa.OpAMD64VPBROADCASTD512,
- ssa.OpAMD64VPBROADCASTB256,
- ssa.OpAMD64VPBROADCASTW512,
- ssa.OpAMD64VPBROADCASTB512,
ssa.OpAMD64VCVTPD2PSX128,
ssa.OpAMD64VCVTPD2PSY128,
ssa.OpAMD64VCVTPD2PS256,
@@ -216,7 +199,24 @@
ssa.OpAMD64VPMOVQW128_512,
ssa.OpAMD64VPMOVQD128_128,
ssa.OpAMD64VPMOVQD128_256,
- ssa.OpAMD64VPMOVQD256:
+ ssa.OpAMD64VPMOVQD256,
+ ssa.OpAMD64VPBROADCASTQ128,
+ ssa.OpAMD64VBROADCASTSS128,
+ ssa.OpAMD64VBROADCASTSD256,
+ ssa.OpAMD64VPBROADCASTD128,
+ ssa.OpAMD64VPBROADCASTQ256,
+ ssa.OpAMD64VBROADCASTSS256,
+ ssa.OpAMD64VBROADCASTSD512,
+ ssa.OpAMD64VPBROADCASTW128,
+ ssa.OpAMD64VPBROADCASTD256,
+ ssa.OpAMD64VPBROADCASTQ512,
+ ssa.OpAMD64VBROADCASTSS512,
+ ssa.OpAMD64VPBROADCASTB128,
+ ssa.OpAMD64VPBROADCASTW256,
+ ssa.OpAMD64VPBROADCASTD512,
+ ssa.OpAMD64VPBROADCASTB256,
+ ssa.OpAMD64VPBROADCASTW512,
+ ssa.OpAMD64VPBROADCASTB512:
p = simdV11(s, v)
case ssa.OpAMD64VAESDECLAST128,
@@ -832,23 +832,6 @@
ssa.OpAMD64VPABSQMasked128,
ssa.OpAMD64VPABSQMasked256,
ssa.OpAMD64VPABSQMasked512,
- ssa.OpAMD64VPBROADCASTQMasked128,
- ssa.OpAMD64VBROADCASTSSMasked128,
- ssa.OpAMD64VBROADCASTSDMasked256,
- ssa.OpAMD64VPBROADCASTDMasked128,
- ssa.OpAMD64VPBROADCASTQMasked256,
- ssa.OpAMD64VBROADCASTSSMasked256,
- ssa.OpAMD64VBROADCASTSDMasked512,
- ssa.OpAMD64VPBROADCASTWMasked128,
- ssa.OpAMD64VPBROADCASTDMasked256,
- ssa.OpAMD64VPBROADCASTQMasked512,
- ssa.OpAMD64VBROADCASTSSMasked512,
- ssa.OpAMD64VPBROADCASTBMasked128,
- ssa.OpAMD64VPBROADCASTWMasked256,
- ssa.OpAMD64VPBROADCASTDMasked512,
- ssa.OpAMD64VPBROADCASTBMasked256,
- ssa.OpAMD64VPBROADCASTWMasked512,
- ssa.OpAMD64VPBROADCASTBMasked512,
ssa.OpAMD64VCOMPRESSPSMasked128,
ssa.OpAMD64VCOMPRESSPSMasked256,
ssa.OpAMD64VCOMPRESSPSMasked512,
@@ -1060,6 +1043,23 @@
ssa.OpAMD64VPMOVQDMasked128_128,
ssa.OpAMD64VPMOVQDMasked128_256,
ssa.OpAMD64VPMOVQDMasked256,
+ ssa.OpAMD64VPBROADCASTQMasked128,
+ ssa.OpAMD64VBROADCASTSSMasked128,
+ ssa.OpAMD64VBROADCASTSDMasked256,
+ ssa.OpAMD64VPBROADCASTDMasked128,
+ ssa.OpAMD64VPBROADCASTQMasked256,
+ ssa.OpAMD64VBROADCASTSSMasked256,
+ ssa.OpAMD64VBROADCASTSDMasked512,
+ ssa.OpAMD64VPBROADCASTWMasked128,
+ ssa.OpAMD64VPBROADCASTDMasked256,
+ ssa.OpAMD64VPBROADCASTQMasked512,
+ ssa.OpAMD64VBROADCASTSSMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked128,
+ ssa.OpAMD64VPBROADCASTWMasked256,
+ ssa.OpAMD64VPBROADCASTDMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked256,
+ ssa.OpAMD64VPBROADCASTWMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked512,
ssa.OpAMD64VMOVDQU8Masked128,
ssa.OpAMD64VMOVDQU8Masked256,
ssa.OpAMD64VMOVDQU8Masked512,
@@ -2448,23 +2448,6 @@
ssa.OpAMD64VPABSQMasked128Merging,
ssa.OpAMD64VPABSQMasked256Merging,
ssa.OpAMD64VPABSQMasked512Merging,
- ssa.OpAMD64VPBROADCASTQMasked128Merging,
- ssa.OpAMD64VBROADCASTSSMasked128Merging,
- ssa.OpAMD64VBROADCASTSDMasked256Merging,
- ssa.OpAMD64VPBROADCASTDMasked128Merging,
- ssa.OpAMD64VPBROADCASTQMasked256Merging,
- ssa.OpAMD64VBROADCASTSSMasked256Merging,
- ssa.OpAMD64VBROADCASTSDMasked512Merging,
- ssa.OpAMD64VPBROADCASTWMasked128Merging,
- ssa.OpAMD64VPBROADCASTDMasked256Merging,
- ssa.OpAMD64VPBROADCASTQMasked512Merging,
- ssa.OpAMD64VBROADCASTSSMasked512Merging,
- ssa.OpAMD64VPBROADCASTBMasked128Merging,
- ssa.OpAMD64VPBROADCASTWMasked256Merging,
- ssa.OpAMD64VPBROADCASTDMasked512Merging,
- ssa.OpAMD64VPBROADCASTBMasked256Merging,
- ssa.OpAMD64VPBROADCASTWMasked512Merging,
- ssa.OpAMD64VPBROADCASTBMasked512Merging,
ssa.OpAMD64VRNDSCALEPSMasked128Merging,
ssa.OpAMD64VRNDSCALEPSMasked256Merging,
ssa.OpAMD64VRNDSCALEPSMasked512Merging,
@@ -2664,6 +2647,23 @@
ssa.OpAMD64VPMOVQDMasked128_128Merging,
ssa.OpAMD64VPMOVQDMasked128_256Merging,
ssa.OpAMD64VPMOVQDMasked256Merging,
+ ssa.OpAMD64VPBROADCASTQMasked128Merging,
+ ssa.OpAMD64VBROADCASTSSMasked128Merging,
+ ssa.OpAMD64VBROADCASTSDMasked256Merging,
+ ssa.OpAMD64VPBROADCASTDMasked128Merging,
+ ssa.OpAMD64VPBROADCASTQMasked256Merging,
+ ssa.OpAMD64VBROADCASTSSMasked256Merging,
+ ssa.OpAMD64VBROADCASTSDMasked512Merging,
+ ssa.OpAMD64VPBROADCASTWMasked128Merging,
+ ssa.OpAMD64VPBROADCASTDMasked256Merging,
+ ssa.OpAMD64VPBROADCASTQMasked512Merging,
+ ssa.OpAMD64VBROADCASTSSMasked512Merging,
+ ssa.OpAMD64VPBROADCASTBMasked128Merging,
+ ssa.OpAMD64VPBROADCASTWMasked256Merging,
+ ssa.OpAMD64VPBROADCASTDMasked512Merging,
+ ssa.OpAMD64VPBROADCASTBMasked256Merging,
+ ssa.OpAMD64VPBROADCASTWMasked512Merging,
+ ssa.OpAMD64VPBROADCASTBMasked512Merging,
ssa.OpAMD64VPSHUFDMasked256Merging,
ssa.OpAMD64VPSHUFDMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked256Merging,
@@ -2805,23 +2805,6 @@
ssa.OpAMD64VPAVGWMasked128,
ssa.OpAMD64VPAVGWMasked256,
ssa.OpAMD64VPAVGWMasked512,
- ssa.OpAMD64VPBROADCASTQMasked128,
- ssa.OpAMD64VBROADCASTSSMasked128,
- ssa.OpAMD64VBROADCASTSDMasked256,
- ssa.OpAMD64VPBROADCASTDMasked128,
- ssa.OpAMD64VPBROADCASTQMasked256,
- ssa.OpAMD64VBROADCASTSSMasked256,
- ssa.OpAMD64VBROADCASTSDMasked512,
- ssa.OpAMD64VPBROADCASTWMasked128,
- ssa.OpAMD64VPBROADCASTDMasked256,
- ssa.OpAMD64VPBROADCASTQMasked512,
- ssa.OpAMD64VBROADCASTSSMasked512,
- ssa.OpAMD64VPBROADCASTBMasked128,
- ssa.OpAMD64VPBROADCASTWMasked256,
- ssa.OpAMD64VPBROADCASTDMasked512,
- ssa.OpAMD64VPBROADCASTBMasked256,
- ssa.OpAMD64VPBROADCASTWMasked512,
- ssa.OpAMD64VPBROADCASTBMasked512,
ssa.OpAMD64VRNDSCALEPSMasked128,
ssa.OpAMD64VRNDSCALEPSMasked128load,
ssa.OpAMD64VRNDSCALEPSMasked256,
@@ -3662,6 +3645,23 @@
ssa.OpAMD64VPXORQMasked256load,
ssa.OpAMD64VPXORQMasked512,
ssa.OpAMD64VPXORQMasked512load,
+ ssa.OpAMD64VPBROADCASTQMasked128,
+ ssa.OpAMD64VBROADCASTSSMasked128,
+ ssa.OpAMD64VBROADCASTSDMasked256,
+ ssa.OpAMD64VPBROADCASTDMasked128,
+ ssa.OpAMD64VPBROADCASTQMasked256,
+ ssa.OpAMD64VBROADCASTSSMasked256,
+ ssa.OpAMD64VBROADCASTSDMasked512,
+ ssa.OpAMD64VPBROADCASTWMasked128,
+ ssa.OpAMD64VPBROADCASTDMasked256,
+ ssa.OpAMD64VPBROADCASTQMasked512,
+ ssa.OpAMD64VBROADCASTSSMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked128,
+ ssa.OpAMD64VPBROADCASTWMasked256,
+ ssa.OpAMD64VPBROADCASTDMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked256,
+ ssa.OpAMD64VPBROADCASTWMasked512,
+ ssa.OpAMD64VPBROADCASTBMasked512,
ssa.OpAMD64VMOVDQU8Masked128,
ssa.OpAMD64VMOVDQU8Masked256,
ssa.OpAMD64VMOVDQU8Masked512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 716668b..2e295d9 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -140,36 +140,6 @@
(AverageUint16x8 ...) => (VPAVGW128 ...)
(AverageUint16x16 ...) => (VPAVGW256 ...)
(AverageUint16x32 ...) => (VPAVGW512 ...)
-(Broadcast1To2Float64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast1To2Int64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast1To2Uint64x2 ...) => (VPBROADCASTQ128 ...)
-(Broadcast1To4Float32x4 ...) => (VBROADCASTSS128 ...)
-(Broadcast1To4Float64x2 ...) => (VBROADCASTSD256 ...)
-(Broadcast1To4Int32x4 ...) => (VPBROADCASTD128 ...)
-(Broadcast1To4Int64x2 ...) => (VPBROADCASTQ256 ...)
-(Broadcast1To4Uint32x4 ...) => (VPBROADCASTD128 ...)
-(Broadcast1To4Uint64x2 ...) => (VPBROADCASTQ256 ...)
-(Broadcast1To8Float32x4 ...) => (VBROADCASTSS256 ...)
-(Broadcast1To8Float64x2 ...) => (VBROADCASTSD512 ...)
-(Broadcast1To8Int16x8 ...) => (VPBROADCASTW128 ...)
-(Broadcast1To8Int32x4 ...) => (VPBROADCASTD256 ...)
-(Broadcast1To8Int64x2 ...) => (VPBROADCASTQ512 ...)
-(Broadcast1To8Uint16x8 ...) => (VPBROADCASTW128 ...)
-(Broadcast1To8Uint32x4 ...) => (VPBROADCASTD256 ...)
-(Broadcast1To8Uint64x2 ...) => (VPBROADCASTQ512 ...)
-(Broadcast1To16Float32x4 ...) => (VBROADCASTSS512 ...)
-(Broadcast1To16Int8x16 ...) => (VPBROADCASTB128 ...)
-(Broadcast1To16Int16x8 ...) => (VPBROADCASTW256 ...)
-(Broadcast1To16Int32x4 ...) => (VPBROADCASTD512 ...)
-(Broadcast1To16Uint8x16 ...) => (VPBROADCASTB128 ...)
-(Broadcast1To16Uint16x8 ...) => (VPBROADCASTW256 ...)
-(Broadcast1To16Uint32x4 ...) => (VPBROADCASTD512 ...)
-(Broadcast1To32Int8x16 ...) => (VPBROADCASTB256 ...)
-(Broadcast1To32Int16x8 ...) => (VPBROADCASTW512 ...)
-(Broadcast1To32Uint8x16 ...) => (VPBROADCASTB256 ...)
-(Broadcast1To32Uint16x8 ...) => (VPBROADCASTW512 ...)
-(Broadcast1To64Int8x16 ...) => (VPBROADCASTB512 ...)
-(Broadcast1To64Uint8x16 ...) => (VPBROADCASTB512 ...)
(CeilFloat32x4 x) => (VROUNDPS128 [2] x)
(CeilFloat32x8 x) => (VROUNDPS256 [2] x)
(CeilFloat64x2 x) => (VROUNDPD128 [2] x)
@@ -1339,6 +1309,66 @@
(blendMaskedInt16x32 x y mask) => (VPBLENDMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
(blendMaskedInt32x16 x y mask) => (VPBLENDMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
(blendMaskedInt64x8 x y mask) => (VPBLENDMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(broadcast1To2Float64x2 ...) => (VPBROADCASTQ128 ...)
+(broadcast1To2Int64x2 ...) => (VPBROADCASTQ128 ...)
+(broadcast1To2Uint64x2 ...) => (VPBROADCASTQ128 ...)
+(broadcast1To2MaskedFloat64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To2MaskedInt64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To2MaskedUint64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To4Float32x4 ...) => (VBROADCASTSS128 ...)
+(broadcast1To4Float64x2 ...) => (VBROADCASTSD256 ...)
+(broadcast1To4Int32x4 ...) => (VPBROADCASTD128 ...)
+(broadcast1To4Int64x2 ...) => (VPBROADCASTQ256 ...)
+(broadcast1To4Uint32x4 ...) => (VPBROADCASTD128 ...)
+(broadcast1To4Uint64x2 ...) => (VPBROADCASTQ256 ...)
+(broadcast1To4MaskedFloat32x4 x mask) => (VBROADCASTSSMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To4MaskedFloat64x2 x mask) => (VBROADCASTSDMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To4MaskedInt32x4 x mask) => (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To4MaskedInt64x2 x mask) => (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To4MaskedUint32x4 x mask) => (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To4MaskedUint64x2 x mask) => (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To8Float32x4 ...) => (VBROADCASTSS256 ...)
+(broadcast1To8Float64x2 ...) => (VBROADCASTSD512 ...)
+(broadcast1To8Int16x8 ...) => (VPBROADCASTW128 ...)
+(broadcast1To8Int32x4 ...) => (VPBROADCASTD256 ...)
+(broadcast1To8Int64x2 ...) => (VPBROADCASTQ512 ...)
+(broadcast1To8Uint16x8 ...) => (VPBROADCASTW128 ...)
+(broadcast1To8Uint32x4 ...) => (VPBROADCASTD256 ...)
+(broadcast1To8Uint64x2 ...) => (VPBROADCASTQ512 ...)
+(broadcast1To8MaskedFloat32x4 x mask) => (VBROADCASTSSMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To8MaskedFloat64x2 x mask) => (VBROADCASTSDMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To8MaskedInt16x8 x mask) => (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To8MaskedInt32x4 x mask) => (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To8MaskedInt64x2 x mask) => (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To8MaskedUint16x8 x mask) => (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To8MaskedUint32x4 x mask) => (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To8MaskedUint64x2 x mask) => (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(broadcast1To16Float32x4 ...) => (VBROADCASTSS512 ...)
+(broadcast1To16Int8x16 ...) => (VPBROADCASTB128 ...)
+(broadcast1To16Int16x8 ...) => (VPBROADCASTW256 ...)
+(broadcast1To16Int32x4 ...) => (VPBROADCASTD512 ...)
+(broadcast1To16Uint8x16 ...) => (VPBROADCASTB128 ...)
+(broadcast1To16Uint16x8 ...) => (VPBROADCASTW256 ...)
+(broadcast1To16Uint32x4 ...) => (VPBROADCASTD512 ...)
+(broadcast1To16MaskedFloat32x4 x mask) => (VBROADCASTSSMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To16MaskedInt8x16 x mask) => (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(broadcast1To16MaskedInt16x8 x mask) => (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To16MaskedInt32x4 x mask) => (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To16MaskedUint8x16 x mask) => (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(broadcast1To16MaskedUint16x8 x mask) => (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To16MaskedUint32x4 x mask) => (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(broadcast1To32Int8x16 ...) => (VPBROADCASTB256 ...)
+(broadcast1To32Int16x8 ...) => (VPBROADCASTW512 ...)
+(broadcast1To32Uint8x16 ...) => (VPBROADCASTB256 ...)
+(broadcast1To32Uint16x8 ...) => (VPBROADCASTW512 ...)
+(broadcast1To32MaskedInt8x16 x mask) => (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(broadcast1To32MaskedInt16x8 x mask) => (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To32MaskedUint8x16 x mask) => (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(broadcast1To32MaskedUint16x8 x mask) => (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(broadcast1To64Int8x16 ...) => (VPBROADCASTB512 ...)
+(broadcast1To64Uint8x16 ...) => (VPBROADCASTB512 ...)
+(broadcast1To64MaskedInt8x16 x mask) => (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(broadcast1To64MaskedUint8x16 x mask) => (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
(carrylessMultiplyUint64x2 ...) => (VPCLMULQDQ128 ...)
(carrylessMultiplyUint64x4 ...) => (VPCLMULQDQ256 ...)
(carrylessMultiplyUint64x8 ...) => (VPCLMULQDQ512 ...)
@@ -1496,23 +1526,6 @@
(VMOVDQU16Masked128 (VPAVGW128 x y) mask) => (VPAVGWMasked128 x y mask)
(VMOVDQU16Masked256 (VPAVGW256 x y) mask) => (VPAVGWMasked256 x y mask)
(VMOVDQU16Masked512 (VPAVGW512 x y) mask) => (VPAVGWMasked512 x y mask)
-(VMOVDQU64Masked128 (VPBROADCASTQ128 x) mask) => (VPBROADCASTQMasked128 x mask)
-(VMOVDQU32Masked128 (VBROADCASTSS128 x) mask) => (VBROADCASTSSMasked128 x mask)
-(VMOVDQU64Masked256 (VBROADCASTSD256 x) mask) => (VBROADCASTSDMasked256 x mask)
-(VMOVDQU32Masked128 (VPBROADCASTD128 x) mask) => (VPBROADCASTDMasked128 x mask)
-(VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask) => (VPBROADCASTQMasked256 x mask)
-(VMOVDQU32Masked256 (VBROADCASTSS256 x) mask) => (VBROADCASTSSMasked256 x mask)
-(VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) => (VBROADCASTSDMasked512 x mask)
-(VMOVDQU16Masked128 (VPBROADCASTW128 x) mask) => (VPBROADCASTWMasked128 x mask)
-(VMOVDQU32Masked256 (VPBROADCASTD256 x) mask) => (VPBROADCASTDMasked256 x mask)
-(VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) => (VPBROADCASTQMasked512 x mask)
-(VMOVDQU32Masked512 (VBROADCASTSS512 x) mask) => (VBROADCASTSSMasked512 x mask)
-(VMOVDQU8Masked128 (VPBROADCASTB128 x) mask) => (VPBROADCASTBMasked128 x mask)
-(VMOVDQU16Masked256 (VPBROADCASTW256 x) mask) => (VPBROADCASTWMasked256 x mask)
-(VMOVDQU32Masked512 (VPBROADCASTD512 x) mask) => (VPBROADCASTDMasked512 x mask)
-(VMOVDQU8Masked256 (VPBROADCASTB256 x) mask) => (VPBROADCASTBMasked256 x mask)
-(VMOVDQU16Masked512 (VPBROADCASTW512 x) mask) => (VPBROADCASTWMasked512 x mask)
-(VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) => (VPBROADCASTBMasked512 x mask)
(VMOVDQU32Masked128 (VRNDSCALEPS128 [a] x) mask) => (VRNDSCALEPSMasked128 [a] x mask)
(VMOVDQU32Masked256 (VRNDSCALEPS256 [a] x) mask) => (VRNDSCALEPSMasked256 [a] x mask)
(VMOVDQU32Masked512 (VRNDSCALEPS512 [a] x) mask) => (VRNDSCALEPSMasked512 [a] x mask)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index e076b06..44b9283 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -143,36 +143,6 @@
{name: "AverageUint16x8", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AverageUint16x16", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AverageUint16x32", argLength: 2, commutative: true}, // ARCH:amd64
- {name: "Broadcast1To2Float64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To2Int64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To2Uint64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Float32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Float64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Int32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Int64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Uint32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To4Uint64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Float32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Float64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Int16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Int32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Int64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Uint16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Uint32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To8Uint64x2", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Float32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Int8x16", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Int16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Int32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Uint8x16", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Uint16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To16Uint32x4", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To32Int8x16", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To32Int16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To32Uint8x16", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To32Uint16x8", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To64Int8x16", argLength: 1}, // ARCH:amd64
- {name: "Broadcast1To64Uint8x16", argLength: 1}, // ARCH:amd64
{name: "CeilFloat32x4", argLength: 1}, // ARCH:amd64
{name: "CeilFloat32x8", argLength: 1}, // ARCH:amd64
{name: "CeilFloat64x2", argLength: 1}, // ARCH:amd64
@@ -1140,6 +1110,66 @@
{name: "blendMaskedInt16x32", argLength: 3}, // ARCH:amd64
{name: "blendMaskedInt32x16", argLength: 3}, // ARCH:amd64
{name: "blendMaskedInt64x8", argLength: 3}, // ARCH:amd64
+ {name: "broadcast1To2Float64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To2Int64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To2MaskedFloat64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To2MaskedInt64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To2MaskedUint64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To2Uint64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4Float32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4Float64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4Int32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4Int64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4MaskedFloat32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4MaskedFloat64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4MaskedInt32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4MaskedInt64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4MaskedUint32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4MaskedUint64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To4Uint32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To4Uint64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Float32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Float64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Int16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Int32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Int64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8MaskedFloat32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedFloat64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedInt32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedInt64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedUint16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedUint32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8MaskedUint64x2", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To8Uint16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Uint32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To8Uint64x2", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Float32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Int8x16", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Int16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Int32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16MaskedFloat32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedInt8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedInt32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedUint8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedUint16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16MaskedUint32x4", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To16Uint8x16", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Uint16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To16Uint32x4", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To32Int8x16", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To32Int16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To32MaskedInt8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To32MaskedInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To32MaskedUint8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To32MaskedUint16x8", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To32Uint8x16", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To32Uint16x8", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To64Int8x16", argLength: 1}, // ARCH:amd64
+ {name: "broadcast1To64MaskedInt8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To64MaskedUint8x16", argLength: 2}, // ARCH:amd64
+ {name: "broadcast1To64Uint8x16", argLength: 1}, // ARCH:amd64
{name: "AESRoundKeyGenAssistUint32x4", argLength: 1, aux: "UInt8"}, // ARCH:amd64
{name: "CeilScaledFloat32x4", argLength: 1, aux: "UInt8"}, // ARCH:amd64
{name: "CeilScaledFloat32x8", argLength: 1, aux: "UInt8"}, // ARCH:amd64
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 4e52781..50e7df3 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -6345,36 +6345,6 @@
OpAverageUint16x8
OpAverageUint16x16
OpAverageUint16x32
- OpBroadcast1To2Float64x2
- OpBroadcast1To2Int64x2
- OpBroadcast1To2Uint64x2
- OpBroadcast1To4Float32x4
- OpBroadcast1To4Float64x2
- OpBroadcast1To4Int32x4
- OpBroadcast1To4Int64x2
- OpBroadcast1To4Uint32x4
- OpBroadcast1To4Uint64x2
- OpBroadcast1To8Float32x4
- OpBroadcast1To8Float64x2
- OpBroadcast1To8Int16x8
- OpBroadcast1To8Int32x4
- OpBroadcast1To8Int64x2
- OpBroadcast1To8Uint16x8
- OpBroadcast1To8Uint32x4
- OpBroadcast1To8Uint64x2
- OpBroadcast1To16Float32x4
- OpBroadcast1To16Int8x16
- OpBroadcast1To16Int16x8
- OpBroadcast1To16Int32x4
- OpBroadcast1To16Uint8x16
- OpBroadcast1To16Uint16x8
- OpBroadcast1To16Uint32x4
- OpBroadcast1To32Int8x16
- OpBroadcast1To32Int16x8
- OpBroadcast1To32Uint8x16
- OpBroadcast1To32Uint16x8
- OpBroadcast1To64Int8x16
- OpBroadcast1To64Uint8x16
OpCeilFloat32x4
OpCeilFloat32x8
OpCeilFloat64x2
@@ -7342,6 +7312,66 @@
OpblendMaskedInt16x32
OpblendMaskedInt32x16
OpblendMaskedInt64x8
+ Opbroadcast1To2Float64x2
+ Opbroadcast1To2Int64x2
+ Opbroadcast1To2MaskedFloat64x2
+ Opbroadcast1To2MaskedInt64x2
+ Opbroadcast1To2MaskedUint64x2
+ Opbroadcast1To2Uint64x2
+ Opbroadcast1To4Float32x4
+ Opbroadcast1To4Float64x2
+ Opbroadcast1To4Int32x4
+ Opbroadcast1To4Int64x2
+ Opbroadcast1To4MaskedFloat32x4
+ Opbroadcast1To4MaskedFloat64x2
+ Opbroadcast1To4MaskedInt32x4
+ Opbroadcast1To4MaskedInt64x2
+ Opbroadcast1To4MaskedUint32x4
+ Opbroadcast1To4MaskedUint64x2
+ Opbroadcast1To4Uint32x4
+ Opbroadcast1To4Uint64x2
+ Opbroadcast1To8Float32x4
+ Opbroadcast1To8Float64x2
+ Opbroadcast1To8Int16x8
+ Opbroadcast1To8Int32x4
+ Opbroadcast1To8Int64x2
+ Opbroadcast1To8MaskedFloat32x4
+ Opbroadcast1To8MaskedFloat64x2
+ Opbroadcast1To8MaskedInt16x8
+ Opbroadcast1To8MaskedInt32x4
+ Opbroadcast1To8MaskedInt64x2
+ Opbroadcast1To8MaskedUint16x8
+ Opbroadcast1To8MaskedUint32x4
+ Opbroadcast1To8MaskedUint64x2
+ Opbroadcast1To8Uint16x8
+ Opbroadcast1To8Uint32x4
+ Opbroadcast1To8Uint64x2
+ Opbroadcast1To16Float32x4
+ Opbroadcast1To16Int8x16
+ Opbroadcast1To16Int16x8
+ Opbroadcast1To16Int32x4
+ Opbroadcast1To16MaskedFloat32x4
+ Opbroadcast1To16MaskedInt8x16
+ Opbroadcast1To16MaskedInt16x8
+ Opbroadcast1To16MaskedInt32x4
+ Opbroadcast1To16MaskedUint8x16
+ Opbroadcast1To16MaskedUint16x8
+ Opbroadcast1To16MaskedUint32x4
+ Opbroadcast1To16Uint8x16
+ Opbroadcast1To16Uint16x8
+ Opbroadcast1To16Uint32x4
+ Opbroadcast1To32Int8x16
+ Opbroadcast1To32Int16x8
+ Opbroadcast1To32MaskedInt8x16
+ Opbroadcast1To32MaskedInt16x8
+ Opbroadcast1To32MaskedUint8x16
+ Opbroadcast1To32MaskedUint16x8
+ Opbroadcast1To32Uint8x16
+ Opbroadcast1To32Uint16x8
+ Opbroadcast1To64Int8x16
+ Opbroadcast1To64MaskedInt8x16
+ Opbroadcast1To64MaskedUint8x16
+ Opbroadcast1To64Uint8x16
OpAESRoundKeyGenAssistUint32x4
OpCeilScaledFloat32x4
OpCeilScaledFloat32x8
@@ -90401,156 +90431,6 @@
generic: true,
},
{
- name: "Broadcast1To2Float64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To2Int64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To2Uint64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Float32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Float64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Int32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Int64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Uint32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To4Uint64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Float32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Float64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Int16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Int32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Int64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Uint16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Uint32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To8Uint64x2",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Float32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Int8x16",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Int16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Int32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Uint8x16",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Uint16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To16Uint32x4",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To32Int8x16",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To32Int16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To32Uint8x16",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To32Uint16x8",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To64Int8x16",
- argLen: 1,
- generic: true,
- },
- {
- name: "Broadcast1To64Uint8x16",
- argLen: 1,
- generic: true,
- },
- {
name: "CeilFloat32x4",
argLen: 1,
generic: true,
@@ -95572,6 +95452,306 @@
generic: true,
},
{
+ name: "broadcast1To2Float64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To2Int64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To2MaskedFloat64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To2MaskedInt64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To2MaskedUint64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To2Uint64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Float32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Float64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Int32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Int64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedFloat32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedFloat64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedInt32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedInt64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedUint32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4MaskedUint64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Uint32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To4Uint64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Float32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Float64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Int16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Int32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Int64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedFloat32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedFloat64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedInt32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedInt64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedUint16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedUint32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8MaskedUint64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Uint16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Uint32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To8Uint64x2",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Float32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Int8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Int16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Int32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedFloat32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedInt8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedInt32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedUint8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedUint16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16MaskedUint32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Uint8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Uint16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To16Uint32x4",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32Int8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32Int16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32MaskedInt8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32MaskedInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32MaskedUint8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32MaskedUint16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32Uint8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To32Uint16x8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To64Int8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "broadcast1To64MaskedInt8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To64MaskedUint8x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "broadcast1To64Uint8x16",
+ argLen: 1,
+ generic: true,
+ },
+ {
name: "AESRoundKeyGenAssistUint32x4",
auxType: auxUInt8,
argLen: 1,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index e9bb9f1..dc3c553 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2557,96 +2557,6 @@
return rewriteValueAMD64_OpBitLen64(v)
case OpBitLen8:
return rewriteValueAMD64_OpBitLen8(v)
- case OpBroadcast1To16Float32x4:
- v.Op = OpAMD64VBROADCASTSS512
- return true
- case OpBroadcast1To16Int16x8:
- v.Op = OpAMD64VPBROADCASTW256
- return true
- case OpBroadcast1To16Int32x4:
- v.Op = OpAMD64VPBROADCASTD512
- return true
- case OpBroadcast1To16Int8x16:
- v.Op = OpAMD64VPBROADCASTB128
- return true
- case OpBroadcast1To16Uint16x8:
- v.Op = OpAMD64VPBROADCASTW256
- return true
- case OpBroadcast1To16Uint32x4:
- v.Op = OpAMD64VPBROADCASTD512
- return true
- case OpBroadcast1To16Uint8x16:
- v.Op = OpAMD64VPBROADCASTB128
- return true
- case OpBroadcast1To2Float64x2:
- v.Op = OpAMD64VPBROADCASTQ128
- return true
- case OpBroadcast1To2Int64x2:
- v.Op = OpAMD64VPBROADCASTQ128
- return true
- case OpBroadcast1To2Uint64x2:
- v.Op = OpAMD64VPBROADCASTQ128
- return true
- case OpBroadcast1To32Int16x8:
- v.Op = OpAMD64VPBROADCASTW512
- return true
- case OpBroadcast1To32Int8x16:
- v.Op = OpAMD64VPBROADCASTB256
- return true
- case OpBroadcast1To32Uint16x8:
- v.Op = OpAMD64VPBROADCASTW512
- return true
- case OpBroadcast1To32Uint8x16:
- v.Op = OpAMD64VPBROADCASTB256
- return true
- case OpBroadcast1To4Float32x4:
- v.Op = OpAMD64VBROADCASTSS128
- return true
- case OpBroadcast1To4Float64x2:
- v.Op = OpAMD64VBROADCASTSD256
- return true
- case OpBroadcast1To4Int32x4:
- v.Op = OpAMD64VPBROADCASTD128
- return true
- case OpBroadcast1To4Int64x2:
- v.Op = OpAMD64VPBROADCASTQ256
- return true
- case OpBroadcast1To4Uint32x4:
- v.Op = OpAMD64VPBROADCASTD128
- return true
- case OpBroadcast1To4Uint64x2:
- v.Op = OpAMD64VPBROADCASTQ256
- return true
- case OpBroadcast1To64Int8x16:
- v.Op = OpAMD64VPBROADCASTB512
- return true
- case OpBroadcast1To64Uint8x16:
- v.Op = OpAMD64VPBROADCASTB512
- return true
- case OpBroadcast1To8Float32x4:
- v.Op = OpAMD64VBROADCASTSS256
- return true
- case OpBroadcast1To8Float64x2:
- v.Op = OpAMD64VBROADCASTSD512
- return true
- case OpBroadcast1To8Int16x8:
- v.Op = OpAMD64VPBROADCASTW128
- return true
- case OpBroadcast1To8Int32x4:
- v.Op = OpAMD64VPBROADCASTD256
- return true
- case OpBroadcast1To8Int64x2:
- v.Op = OpAMD64VPBROADCASTQ512
- return true
- case OpBroadcast1To8Uint16x8:
- v.Op = OpAMD64VPBROADCASTW128
- return true
- case OpBroadcast1To8Uint32x4:
- v.Op = OpAMD64VPBROADCASTD256
- return true
- case OpBroadcast1To8Uint64x2:
- v.Op = OpAMD64VPBROADCASTQ512
- return true
case OpBswap16:
return rewriteValueAMD64_OpBswap16(v)
case OpBswap32:
@@ -6360,6 +6270,156 @@
return rewriteValueAMD64_OpblendMaskedInt64x8(v)
case OpblendMaskedInt8x64:
return rewriteValueAMD64_OpblendMaskedInt8x64(v)
+ case Opbroadcast1To16Float32x4:
+ v.Op = OpAMD64VBROADCASTSS512
+ return true
+ case Opbroadcast1To16Int16x8:
+ v.Op = OpAMD64VPBROADCASTW256
+ return true
+ case Opbroadcast1To16Int32x4:
+ v.Op = OpAMD64VPBROADCASTD512
+ return true
+ case Opbroadcast1To16Int8x16:
+ v.Op = OpAMD64VPBROADCASTB128
+ return true
+ case Opbroadcast1To16MaskedFloat32x4:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedFloat32x4(v)
+ case Opbroadcast1To16MaskedInt16x8:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedInt16x8(v)
+ case Opbroadcast1To16MaskedInt32x4:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedInt32x4(v)
+ case Opbroadcast1To16MaskedInt8x16:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedInt8x16(v)
+ case Opbroadcast1To16MaskedUint16x8:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedUint16x8(v)
+ case Opbroadcast1To16MaskedUint32x4:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedUint32x4(v)
+ case Opbroadcast1To16MaskedUint8x16:
+ return rewriteValueAMD64_Opbroadcast1To16MaskedUint8x16(v)
+ case Opbroadcast1To16Uint16x8:
+ v.Op = OpAMD64VPBROADCASTW256
+ return true
+ case Opbroadcast1To16Uint32x4:
+ v.Op = OpAMD64VPBROADCASTD512
+ return true
+ case Opbroadcast1To16Uint8x16:
+ v.Op = OpAMD64VPBROADCASTB128
+ return true
+ case Opbroadcast1To2Float64x2:
+ v.Op = OpAMD64VPBROADCASTQ128
+ return true
+ case Opbroadcast1To2Int64x2:
+ v.Op = OpAMD64VPBROADCASTQ128
+ return true
+ case Opbroadcast1To2MaskedFloat64x2:
+ return rewriteValueAMD64_Opbroadcast1To2MaskedFloat64x2(v)
+ case Opbroadcast1To2MaskedInt64x2:
+ return rewriteValueAMD64_Opbroadcast1To2MaskedInt64x2(v)
+ case Opbroadcast1To2MaskedUint64x2:
+ return rewriteValueAMD64_Opbroadcast1To2MaskedUint64x2(v)
+ case Opbroadcast1To2Uint64x2:
+ v.Op = OpAMD64VPBROADCASTQ128
+ return true
+ case Opbroadcast1To32Int16x8:
+ v.Op = OpAMD64VPBROADCASTW512
+ return true
+ case Opbroadcast1To32Int8x16:
+ v.Op = OpAMD64VPBROADCASTB256
+ return true
+ case Opbroadcast1To32MaskedInt16x8:
+ return rewriteValueAMD64_Opbroadcast1To32MaskedInt16x8(v)
+ case Opbroadcast1To32MaskedInt8x16:
+ return rewriteValueAMD64_Opbroadcast1To32MaskedInt8x16(v)
+ case Opbroadcast1To32MaskedUint16x8:
+ return rewriteValueAMD64_Opbroadcast1To32MaskedUint16x8(v)
+ case Opbroadcast1To32MaskedUint8x16:
+ return rewriteValueAMD64_Opbroadcast1To32MaskedUint8x16(v)
+ case Opbroadcast1To32Uint16x8:
+ v.Op = OpAMD64VPBROADCASTW512
+ return true
+ case Opbroadcast1To32Uint8x16:
+ v.Op = OpAMD64VPBROADCASTB256
+ return true
+ case Opbroadcast1To4Float32x4:
+ v.Op = OpAMD64VBROADCASTSS128
+ return true
+ case Opbroadcast1To4Float64x2:
+ v.Op = OpAMD64VBROADCASTSD256
+ return true
+ case Opbroadcast1To4Int32x4:
+ v.Op = OpAMD64VPBROADCASTD128
+ return true
+ case Opbroadcast1To4Int64x2:
+ v.Op = OpAMD64VPBROADCASTQ256
+ return true
+ case Opbroadcast1To4MaskedFloat32x4:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedFloat32x4(v)
+ case Opbroadcast1To4MaskedFloat64x2:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedFloat64x2(v)
+ case Opbroadcast1To4MaskedInt32x4:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedInt32x4(v)
+ case Opbroadcast1To4MaskedInt64x2:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedInt64x2(v)
+ case Opbroadcast1To4MaskedUint32x4:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedUint32x4(v)
+ case Opbroadcast1To4MaskedUint64x2:
+ return rewriteValueAMD64_Opbroadcast1To4MaskedUint64x2(v)
+ case Opbroadcast1To4Uint32x4:
+ v.Op = OpAMD64VPBROADCASTD128
+ return true
+ case Opbroadcast1To4Uint64x2:
+ v.Op = OpAMD64VPBROADCASTQ256
+ return true
+ case Opbroadcast1To64Int8x16:
+ v.Op = OpAMD64VPBROADCASTB512
+ return true
+ case Opbroadcast1To64MaskedInt8x16:
+ return rewriteValueAMD64_Opbroadcast1To64MaskedInt8x16(v)
+ case Opbroadcast1To64MaskedUint8x16:
+ return rewriteValueAMD64_Opbroadcast1To64MaskedUint8x16(v)
+ case Opbroadcast1To64Uint8x16:
+ v.Op = OpAMD64VPBROADCASTB512
+ return true
+ case Opbroadcast1To8Float32x4:
+ v.Op = OpAMD64VBROADCASTSS256
+ return true
+ case Opbroadcast1To8Float64x2:
+ v.Op = OpAMD64VBROADCASTSD512
+ return true
+ case Opbroadcast1To8Int16x8:
+ v.Op = OpAMD64VPBROADCASTW128
+ return true
+ case Opbroadcast1To8Int32x4:
+ v.Op = OpAMD64VPBROADCASTD256
+ return true
+ case Opbroadcast1To8Int64x2:
+ v.Op = OpAMD64VPBROADCASTQ512
+ return true
+ case Opbroadcast1To8MaskedFloat32x4:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedFloat32x4(v)
+ case Opbroadcast1To8MaskedFloat64x2:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedFloat64x2(v)
+ case Opbroadcast1To8MaskedInt16x8:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedInt16x8(v)
+ case Opbroadcast1To8MaskedInt32x4:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedInt32x4(v)
+ case Opbroadcast1To8MaskedInt64x2:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedInt64x2(v)
+ case Opbroadcast1To8MaskedUint16x8:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedUint16x8(v)
+ case Opbroadcast1To8MaskedUint32x4:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedUint32x4(v)
+ case Opbroadcast1To8MaskedUint64x2:
+ return rewriteValueAMD64_Opbroadcast1To8MaskedUint64x2(v)
+ case Opbroadcast1To8Uint16x8:
+ v.Op = OpAMD64VPBROADCASTW128
+ return true
+ case Opbroadcast1To8Uint32x4:
+ v.Op = OpAMD64VPBROADCASTD256
+ return true
+ case Opbroadcast1To8Uint64x2:
+ v.Op = OpAMD64VPBROADCASTQ512
+ return true
case OpcarrylessMultiplyUint64x2:
v.Op = OpAMD64VPCLMULQDQ128
return true
@@ -33324,18 +33384,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU16Masked128 (VPBROADCASTW128 x) mask)
- // result: (VPBROADCASTWMasked128 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTW128 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTWMasked128)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU16Masked128 (VPERMI2W128 x y z) mask)
// result: (VPERMI2WMasked128 x y z mask)
for {
@@ -33890,18 +33938,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU16Masked256 (VPBROADCASTW256 x) mask)
- // result: (VPBROADCASTWMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTW256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTWMasked256)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU16Masked256 (VPERMI2W256 x y z) mask)
// result: (VPERMI2WMasked256 x y z mask)
for {
@@ -34492,18 +34528,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU16Masked512 (VPBROADCASTW512 x) mask)
- // result: (VPBROADCASTWMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTW512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTWMasked512)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU16Masked512 (VPERMI2W512 x y z) mask)
// result: (VPERMI2WMasked512 x y z mask)
for {
@@ -34996,30 +35020,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU32Masked128 (VBROADCASTSS128 x) mask)
- // result: (VBROADCASTSSMasked128 x mask)
- for {
- if v_0.Op != OpAMD64VBROADCASTSS128 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VBROADCASTSSMasked128)
- v.AddArg2(x, mask)
- return true
- }
- // match: (VMOVDQU32Masked128 (VPBROADCASTD128 x) mask)
- // result: (VPBROADCASTDMasked128 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTD128 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTDMasked128)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU32Masked128 (VRNDSCALEPS128 [a] x) mask)
// result: (VRNDSCALEPSMasked128 [a] x mask)
for {
@@ -35769,30 +35769,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU32Masked256 (VBROADCASTSS256 x) mask)
- // result: (VBROADCASTSSMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VBROADCASTSS256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VBROADCASTSSMasked256)
- v.AddArg2(x, mask)
- return true
- }
- // match: (VMOVDQU32Masked256 (VPBROADCASTD256 x) mask)
- // result: (VPBROADCASTDMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTD256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTDMasked256)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU32Masked256 (VRNDSCALEPS256 [a] x) mask)
// result: (VRNDSCALEPSMasked256 [a] x mask)
for {
@@ -36690,30 +36666,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU32Masked512 (VBROADCASTSS512 x) mask)
- // result: (VBROADCASTSSMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VBROADCASTSS512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VBROADCASTSSMasked512)
- v.AddArg2(x, mask)
- return true
- }
- // match: (VMOVDQU32Masked512 (VPBROADCASTD512 x) mask)
- // result: (VPBROADCASTDMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTD512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTDMasked512)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU32Masked512 (VRNDSCALEPS512 [a] x) mask)
// result: (VRNDSCALEPSMasked512 [a] x mask)
for {
@@ -37563,18 +37515,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU64Masked128 (VPBROADCASTQ128 x) mask)
- // result: (VPBROADCASTQMasked128 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTQ128 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTQMasked128)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU64Masked128 (VRNDSCALEPD128 [a] x) mask)
// result: (VRNDSCALEPDMasked128 [a] x mask)
for {
@@ -38440,30 +38380,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU64Masked256 (VBROADCASTSD256 x) mask)
- // result: (VBROADCASTSDMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VBROADCASTSD256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VBROADCASTSDMasked256)
- v.AddArg2(x, mask)
- return true
- }
- // match: (VMOVDQU64Masked256 (VPBROADCASTQ256 x) mask)
- // result: (VPBROADCASTQMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTQ256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTQMasked256)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU64Masked256 (VRNDSCALEPD256 [a] x) mask)
// result: (VRNDSCALEPDMasked256 [a] x mask)
for {
@@ -39357,30 +39273,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU64Masked512 (VBROADCASTSD512 x) mask)
- // result: (VBROADCASTSDMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VBROADCASTSD512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VBROADCASTSDMasked512)
- v.AddArg2(x, mask)
- return true
- }
- // match: (VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask)
- // result: (VPBROADCASTQMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTQ512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTQMasked512)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU64Masked512 (VRNDSCALEPD512 [a] x) mask)
// result: (VRNDSCALEPDMasked512 [a] x mask)
for {
@@ -40168,18 +40060,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU8Masked128 (VPBROADCASTB128 x) mask)
- // result: (VPBROADCASTBMasked128 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTB128 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTBMasked128)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU8Masked128 (VPERMI2B128 x y z) mask)
// result: (VPERMI2BMasked128 x y z mask)
for {
@@ -40522,18 +40402,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU8Masked256 (VPBROADCASTB256 x) mask)
- // result: (VPBROADCASTBMasked256 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTB256 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTBMasked256)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU8Masked256 (VPERMI2B256 x y z) mask)
// result: (VPERMI2BMasked256 x y z mask)
for {
@@ -40876,18 +40744,6 @@
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU8Masked512 (VPBROADCASTB512 x) mask)
- // result: (VPBROADCASTBMasked512 x mask)
- for {
- if v_0.Op != OpAMD64VPBROADCASTB512 {
- break
- }
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPBROADCASTBMasked512)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU8Masked512 (VPERMI2B512 x y z) mask)
// result: (VPERMI2BMasked512 x y z mask)
for {
@@ -79558,6 +79414,486 @@
return true
}
}
+func rewriteValueAMD64_Opbroadcast1To16MaskedFloat32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedFloat32x4 x mask)
+ // result: (VBROADCASTSSMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VBROADCASTSSMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedInt16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedInt16x8 x mask)
+ // result: (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedInt32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedInt32x4 x mask)
+ // result: (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedInt8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedInt8x16 x mask)
+ // result: (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedUint16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedUint16x8 x mask)
+ // result: (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedUint32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedUint32x4 x mask)
+ // result: (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To16MaskedUint8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To16MaskedUint8x16 x mask)
+ // result: (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To2MaskedFloat64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To2MaskedFloat64x2 x mask)
+ // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To2MaskedInt64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To2MaskedInt64x2 x mask)
+ // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To2MaskedUint64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To2MaskedUint64x2 x mask)
+ // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To32MaskedInt16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To32MaskedInt16x8 x mask)
+ // result: (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To32MaskedInt8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To32MaskedInt8x16 x mask)
+ // result: (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To32MaskedUint16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To32MaskedUint16x8 x mask)
+ // result: (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To32MaskedUint8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To32MaskedUint8x16 x mask)
+ // result: (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedFloat32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedFloat32x4 x mask)
+ // result: (VBROADCASTSSMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VBROADCASTSSMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedFloat64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedFloat64x2 x mask)
+ // result: (VBROADCASTSDMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VBROADCASTSDMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedInt32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedInt32x4 x mask)
+ // result: (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedInt64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedInt64x2 x mask)
+ // result: (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedUint32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedUint32x4 x mask)
+ // result: (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To4MaskedUint64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To4MaskedUint64x2 x mask)
+ // result: (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To64MaskedInt8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To64MaskedInt8x16 x mask)
+ // result: (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To64MaskedUint8x16(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To64MaskedUint8x16 x mask)
+ // result: (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTBMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedFloat32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedFloat32x4 x mask)
+ // result: (VBROADCASTSSMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VBROADCASTSSMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedFloat64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedFloat64x2 x mask)
+ // result: (VBROADCASTSDMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VBROADCASTSDMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedInt16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedInt16x8 x mask)
+ // result: (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedInt32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedInt32x4 x mask)
+ // result: (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedInt64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedInt64x2 x mask)
+ // result: (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedUint16x8(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedUint16x8 x mask)
+ // result: (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTWMasked128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedUint32x4(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedUint32x4 x mask)
+ // result: (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTDMasked256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
+func rewriteValueAMD64_Opbroadcast1To8MaskedUint64x2(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (broadcast1To8MaskedUint64x2 x mask)
+ // result: (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+ for {
+ x := v_0
+ mask := v_1
+ v.reset(OpAMD64VPBROADCASTQMasked512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg2(x, v0)
+ return true
+ }
+}
func rewriteBlockAMD64(b *Block) bool {
typ := &b.Func.Config.Types
switch b.Kind {
diff --git a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
index cc298c0..5884c31 100644
--- a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
@@ -150,36 +150,6 @@
addF(simdPackage, "Uint16x8.Average", opLen2(ssa.OpAverageUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.Average", opLen2(ssa.OpAverageUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.Average", opLen2(ssa.OpAverageUint16x32, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Float64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Int64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint64x2.Broadcast1To2", opLen1(ssa.OpBroadcast1To2Uint64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Float64x2, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Int64x2, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x4.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint64x2.Broadcast1To4", opLen1(ssa.OpBroadcast1To4Uint64x2, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float32x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Float64x2, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int32x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Int64x2, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint16x8.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x4.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint32x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint64x2.Broadcast1To8", opLen1(ssa.OpBroadcast1To8Uint64x2, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Float32x4, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int8x16, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int16x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Int32x4, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint8x16.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint8x16, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x8.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint16x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x4.Broadcast1To16", opLen1(ssa.OpBroadcast1To16Uint32x4, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int8x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Int16x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint8x16.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint8x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint16x8.Broadcast1To32", opLen1(ssa.OpBroadcast1To32Uint16x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Int8x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint8x16.Broadcast1To64", opLen1(ssa.OpBroadcast1To64Uint8x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Ceil", opLen1(ssa.OpCeilFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64)
@@ -1295,6 +1265,66 @@
addF(simdPackage, "Int16x32.blendMasked", opLen3(ssa.OpblendMaskedInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.blendMasked", opLen3(ssa.OpblendMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.blendMasked", opLen3(ssa.OpblendMaskedInt64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To2", opLen1(ssa.Opbroadcast1To2Float64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To2", opLen1(ssa.Opbroadcast1To2Int64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To2", opLen1(ssa.Opbroadcast1To2Uint64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To2Masked", opLen2(ssa.Opbroadcast1To2MaskedFloat64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To2Masked", opLen2(ssa.Opbroadcast1To2MaskedInt64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To2Masked", opLen2(ssa.Opbroadcast1To2MaskedUint64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To4", opLen1(ssa.Opbroadcast1To4Float32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To4", opLen1(ssa.Opbroadcast1To4Float64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To4", opLen1(ssa.Opbroadcast1To4Int32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To4", opLen1(ssa.Opbroadcast1To4Int64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To4", opLen1(ssa.Opbroadcast1To4Uint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To4", opLen1(ssa.Opbroadcast1To4Uint64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedFloat32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedFloat64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedInt64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedUint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To4Masked", opLen2(ssa.Opbroadcast1To4MaskedUint64x2, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To8", opLen1(ssa.Opbroadcast1To8Float32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To8", opLen1(ssa.Opbroadcast1To8Float64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To8", opLen1(ssa.Opbroadcast1To8Int16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To8", opLen1(ssa.Opbroadcast1To8Int32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To8", opLen1(ssa.Opbroadcast1To8Int64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To8", opLen1(ssa.Opbroadcast1To8Uint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To8", opLen1(ssa.Opbroadcast1To8Uint32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To8", opLen1(ssa.Opbroadcast1To8Uint64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedFloat32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x2.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedFloat64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedInt16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedInt32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int64x2.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedInt64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedUint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedUint32x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint64x2.broadcast1To8Masked", opLen2(ssa.Opbroadcast1To8MaskedUint64x2, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To16", opLen1(ssa.Opbroadcast1To16Float32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To16", opLen1(ssa.Opbroadcast1To16Int8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To16", opLen1(ssa.Opbroadcast1To16Int16x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To16", opLen1(ssa.Opbroadcast1To16Int32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To16", opLen1(ssa.Opbroadcast1To16Uint8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To16", opLen1(ssa.Opbroadcast1To16Uint16x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To16", opLen1(ssa.Opbroadcast1To16Uint32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float32x4.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedFloat32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedInt8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedInt16x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x4.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedInt32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedUint8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedUint16x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x4.broadcast1To16Masked", opLen2(ssa.Opbroadcast1To16MaskedUint32x4, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To32", opLen1(ssa.Opbroadcast1To32Int8x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To32", opLen1(ssa.Opbroadcast1To32Int16x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To32", opLen1(ssa.Opbroadcast1To32Uint8x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To32", opLen1(ssa.Opbroadcast1To32Uint16x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To32Masked", opLen2(ssa.Opbroadcast1To32MaskedInt8x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x8.broadcast1To32Masked", opLen2(ssa.Opbroadcast1To32MaskedInt16x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To32Masked", opLen2(ssa.Opbroadcast1To32MaskedUint8x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x8.broadcast1To32Masked", opLen2(ssa.Opbroadcast1To32MaskedUint16x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To64", opLen1(ssa.Opbroadcast1To64Int8x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To64", opLen1(ssa.Opbroadcast1To64Uint8x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.broadcast1To64Masked", opLen2(ssa.Opbroadcast1To64MaskedInt8x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.broadcast1To64Masked", opLen2(ssa.Opbroadcast1To64MaskedUint8x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x2.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x2, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint64x4.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x4, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint64x8.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x8, types.TypeVec512, 0), sys.AMD64)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
index 3f8489c..70c8178 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
@@ -70,32 +70,32 @@
commutative: false
documentation: !string |-
// NAME expands the lower elements of x into the masked elements of z.
-- go: Broadcast1To2
+- go: broadcast1To2
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 2 elements of
// the output vector.
-- go: Broadcast1To4
+- go: broadcast1To4
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 4 elements of
// the output vector.
-- go: Broadcast1To8
+- go: broadcast1To8
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 8 elements of
// the output vector.
-- go: Broadcast1To16
+- go: broadcast1To16
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 16 elements of
// the output vector.
-- go: Broadcast1To32
+- go: broadcast1To32
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 32 elements of
// the output vector.
-- go: Broadcast1To64
+- go: broadcast1To64
commutative: false
documentation: !string |-
// NAME copies the lowest element of its input to all 64 elements of
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
index a709c3d..1c0e371 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
@@ -347,7 +347,7 @@
out:
- go: $t
-- go: Broadcast1To2
+- go: broadcast1To2
asm: VPBROADCASTQ
in:
- class: vreg
@@ -361,7 +361,7 @@
base: $b
# weirdly, this one case on AVX2 is memory-operand-only
-- go: Broadcast1To2
+- go: broadcast1To2
asm: VPBROADCASTQ
in:
- class: vreg
@@ -376,7 +376,7 @@
base: int
OverwriteBase: float
-- go: Broadcast1To4
+- go: broadcast1To4
asm: VPBROADCAST[BWDQ]
in:
- class: vreg
@@ -387,7 +387,7 @@
lanes: 4
base: $b
-- go: Broadcast1To8
+- go: broadcast1To8
asm: VPBROADCAST[BWDQ]
in:
- class: vreg
@@ -398,7 +398,7 @@
lanes: 8
base: $b
-- go: Broadcast1To16
+- go: broadcast1To16
asm: VPBROADCAST[BWDQ]
in:
- class: vreg
@@ -409,7 +409,7 @@
lanes: 16
base: $b
-- go: Broadcast1To32
+- go: broadcast1To32
asm: VPBROADCAST[BWDQ]
in:
- class: vreg
@@ -420,7 +420,7 @@
lanes: 32
base: $b
-- go: Broadcast1To64
+- go: broadcast1To64
asm: VPBROADCASTB
in:
- class: vreg
@@ -431,7 +431,7 @@
lanes: 64
base: $b
-- go: Broadcast1To4
+- go: broadcast1To4
asm: VBROADCASTS[SD]
in:
- class: vreg
@@ -442,7 +442,7 @@
lanes: 4
base: float
-- go: Broadcast1To8
+- go: broadcast1To8
asm: VBROADCASTS[SD]
in:
- class: vreg
@@ -453,7 +453,7 @@
lanes: 8
base: float
-- go: Broadcast1To16
+- go: broadcast1To16
asm: VBROADCASTS[SD]
in:
- class: vreg
diff --git a/src/simd/archsimd/_gen/tmplgen/main.go b/src/simd/archsimd/_gen/tmplgen/main.go
index d6708eb..20db7c9 100644
--- a/src/simd/archsimd/_gen/tmplgen/main.go
+++ b/src/simd/archsimd/_gen/tmplgen/main.go
@@ -898,7 +898,7 @@
// Emulated, CPU Feature: {{.CPUfeatureBC}}
func Broadcast{{.VType}}(x {{.Etype}}) {{.VType}} {
var z {{.As128BitVec }}
- return z.SetElem(0, x).Broadcast1To{{.Count}}()
+ return z.SetElem(0, x).broadcast1To{{.Count}}()
}
`)
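
Editor's note: for illustration, with VType=Float64x4, Etype=float64, As128BitVec=Float64x2, and Count=4, the template above now expands to the following (matching the regenerated other_gen_amd64.go later in this CL):

// Emulated, CPU Feature: AVX2
func BroadcastFloat64x4(x float64) Float64x4 {
	var z Float64x2
	return z.SetElem(0, x).broadcast1To4()
}
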
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index d17c1d9..2ee694c 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -805,198 +805,6 @@
// Asm: VPAVGW, CPU Feature: AVX512
func (x Uint16x32) Average(y Uint16x32) Uint16x32
-/* Broadcast1To2 */
-
-// Broadcast1To2 copies the lowest element of its input to all 2 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Float64x2) Broadcast1To2() Float64x2
-
-// Broadcast1To2 copies the lowest element of its input to all 2 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Int64x2) Broadcast1To2() Int64x2
-
-// Broadcast1To2 copies the lowest element of its input to all 2 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Uint64x2) Broadcast1To2() Uint64x2
-
-/* Broadcast1To4 */
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VBROADCASTSS, CPU Feature: AVX2
-func (x Float32x4) Broadcast1To4() Float32x4
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VBROADCASTSD, CPU Feature: AVX2
-func (x Float64x2) Broadcast1To4() Float64x4
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Int32x4) Broadcast1To4() Int32x4
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Int64x2) Broadcast1To4() Int64x4
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Uint32x4) Broadcast1To4() Uint32x4
-
-// Broadcast1To4 copies the lowest element of its input to all 4 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Uint64x2) Broadcast1To4() Uint64x4
-
-/* Broadcast1To8 */
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VBROADCASTSS, CPU Feature: AVX2
-func (x Float32x4) Broadcast1To8() Float32x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VBROADCASTSD, CPU Feature: AVX512
-func (x Float64x2) Broadcast1To8() Float64x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Int16x8) Broadcast1To8() Int16x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Int32x4) Broadcast1To8() Int32x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX512
-func (x Int64x2) Broadcast1To8() Int64x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Uint16x8) Broadcast1To8() Uint16x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Uint32x4) Broadcast1To8() Uint32x8
-
-// Broadcast1To8 copies the lowest element of its input to all 8 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX512
-func (x Uint64x2) Broadcast1To8() Uint64x8
-
-/* Broadcast1To16 */
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VBROADCASTSS, CPU Feature: AVX512
-func (x Float32x4) Broadcast1To16() Float32x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Int8x16) Broadcast1To16() Int8x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Int16x8) Broadcast1To16() Int16x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX512
-func (x Int32x4) Broadcast1To16() Int32x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Uint8x16) Broadcast1To16() Uint8x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Uint16x8) Broadcast1To16() Uint16x16
-
-// Broadcast1To16 copies the lowest element of its input to all 16 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX512
-func (x Uint32x4) Broadcast1To16() Uint32x16
-
-/* Broadcast1To32 */
-
-// Broadcast1To32 copies the lowest element of its input to all 32 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Int8x16) Broadcast1To32() Int8x32
-
-// Broadcast1To32 copies the lowest element of its input to all 32 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX512
-func (x Int16x8) Broadcast1To32() Int16x32
-
-// Broadcast1To32 copies the lowest element of its input to all 32 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Uint8x16) Broadcast1To32() Uint8x32
-
-// Broadcast1To32 copies the lowest element of its input to all 32 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX512
-func (x Uint16x8) Broadcast1To32() Uint16x32
-
-/* Broadcast1To64 */
-
-// Broadcast1To64 copies the lowest element of its input to all 64 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX512
-func (x Int8x16) Broadcast1To64() Int8x64
-
-// Broadcast1To64 copies the lowest element of its input to all 64 elements of
-// the output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX512
-func (x Uint8x16) Broadcast1To64() Uint8x64
-
/* Ceil */
// Ceil rounds elements up to the nearest integer.
diff --git a/src/simd/archsimd/ops_internal_amd64.go b/src/simd/archsimd/ops_internal_amd64.go
index 10749c9..d61f442 100644
--- a/src/simd/archsimd/ops_internal_amd64.go
+++ b/src/simd/archsimd/ops_internal_amd64.go
@@ -52,6 +52,450 @@
// Asm: VPBLENDMQ, CPU Feature: AVX512
func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8
+/* broadcast1To2 */
+
+// broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Float64x2) broadcast1To2() Float64x2
+
+// broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Int64x2) broadcast1To2() Int64x2
+
+// broadcast1To2 copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Uint64x2) broadcast1To2() Uint64x2
+
+/* broadcast1To2Masked */
+
+// broadcast1To2Masked copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Float64x2) broadcast1To2Masked(mask Mask64x2) Float64x2
+
+// broadcast1To2Masked copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) broadcast1To2Masked(mask Mask64x2) Int64x2
+
+// broadcast1To2Masked copies the lowest element of its input to all 2 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) broadcast1To2Masked(mask Mask64x2) Uint64x2
+
+/* broadcast1To4 */
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX2
+func (x Float32x4) broadcast1To4() Float32x4
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX2
+func (x Float64x2) broadcast1To4() Float64x4
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Int32x4) broadcast1To4() Int32x4
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Int64x2) broadcast1To4() Int64x4
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Uint32x4) broadcast1To4() Uint32x4
+
+// broadcast1To4 copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Uint64x2) broadcast1To4() Uint64x4
+
+/* broadcast1To4Masked */
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) broadcast1To4Masked(mask Mask32x4) Float32x4
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) broadcast1To4Masked(mask Mask64x2) Float64x4
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) broadcast1To4Masked(mask Mask32x4) Int32x4
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) broadcast1To4Masked(mask Mask64x2) Int64x4
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) broadcast1To4Masked(mask Mask32x4) Uint32x4
+
+// broadcast1To4Masked copies the lowest element of its input to all 4 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) broadcast1To4Masked(mask Mask64x2) Uint64x4
+
+/* broadcast1To8 */
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX2
+func (x Float32x4) broadcast1To8() Float32x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) broadcast1To8() Float64x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Int16x8) broadcast1To8() Int16x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Int32x4) broadcast1To8() Int32x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) broadcast1To8() Int64x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Uint16x8) broadcast1To8() Uint16x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Uint32x4) broadcast1To8() Uint32x8
+
+// broadcast1To8 copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) broadcast1To8() Uint64x8
+
+/* broadcast1To8Masked */
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) broadcast1To8Masked(mask Mask32x4) Float32x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) broadcast1To8Masked(mask Mask64x2) Float64x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) broadcast1To8Masked(mask Mask16x8) Int16x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) broadcast1To8Masked(mask Mask32x4) Int32x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) broadcast1To8Masked(mask Mask64x2) Int64x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) broadcast1To8Masked(mask Mask16x8) Uint16x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) broadcast1To8Masked(mask Mask32x4) Uint32x8
+
+// broadcast1To8Masked copies the lowest element of its input to all 8 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) broadcast1To8Masked(mask Mask64x2) Uint64x8
+
+/* broadcast1To16 */
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) broadcast1To16() Float32x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) broadcast1To16() Int8x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Int16x8) broadcast1To16() Int16x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) broadcast1To16() Int32x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) broadcast1To16() Uint8x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Uint16x8) broadcast1To16() Uint16x16
+
+// broadcast1To16 copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) broadcast1To16() Uint32x16
+
+/* broadcast1To16Masked */
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) broadcast1To16Masked(mask Mask32x4) Float32x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) broadcast1To16Masked(mask Mask8x16) Int8x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) broadcast1To16Masked(mask Mask16x8) Int16x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) broadcast1To16Masked(mask Mask32x4) Int32x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) broadcast1To16Masked(mask Mask8x16) Uint8x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) broadcast1To16Masked(mask Mask16x8) Uint16x16
+
+// broadcast1To16Masked copies the lowest element of its input to all 16 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) broadcast1To16Masked(mask Mask32x4) Uint32x16
+
+/* broadcast1To32 */
+
+// broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) broadcast1To32() Int8x32
+
+// broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) broadcast1To32() Int16x32
+
+// broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) broadcast1To32() Uint8x32
+
+// broadcast1To32 copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) broadcast1To32() Uint16x32
+
+/* broadcast1To32Masked */
+
+// broadcast1To32Masked copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) broadcast1To32Masked(mask Mask8x16) Int8x32
+
+// broadcast1To32Masked copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) broadcast1To32Masked(mask Mask16x8) Int16x32
+
+// broadcast1To32Masked copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) broadcast1To32Masked(mask Mask8x16) Uint8x32
+
+// broadcast1To32Masked copies the lowest element of its input to all 32 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) broadcast1To32Masked(mask Mask16x8) Uint16x32
+
+/* broadcast1To64 */
+
+// broadcast1To64 copies the lowest element of its input to all 64 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) broadcast1To64() Int8x64
+
+// broadcast1To64 copies the lowest element of its input to all 64 elements of
+// the output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) broadcast1To64() Uint8x64
+
+/* broadcast1To64Masked */
+
+// broadcast1To64Masked copies the lowest element of its input to all 64 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) broadcast1To64Masked(mask Mask8x16) Int8x64
+
+// broadcast1To64Masked copies the lowest element of its input to all 64 elements of
+// the output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) broadcast1To64Masked(mask Mask8x16) Uint8x64
+
/* carrylessMultiply */
// carrylessMultiply computes one of four possible Galois polynomial
diff --git a/src/simd/archsimd/other_gen_amd64.go b/src/simd/archsimd/other_gen_amd64.go
index 61b71e3..6ece31e 100644
--- a/src/simd/archsimd/other_gen_amd64.go
+++ b/src/simd/archsimd/other_gen_amd64.go
@@ -10,7 +10,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt8x16(x int8) Int8x16 {
var z Int8x16
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}
// BroadcastInt16x8 returns a vector with the input
@@ -19,7 +19,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt16x8(x int16) Int16x8 {
var z Int16x8
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}
// BroadcastInt32x4 returns a vector with the input
@@ -28,7 +28,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt32x4(x int32) Int32x4 {
var z Int32x4
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}
// BroadcastInt64x2 returns a vector with the input
@@ -37,7 +37,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt64x2(x int64) Int64x2 {
var z Int64x2
- return z.SetElem(0, x).Broadcast1To2()
+ return z.SetElem(0, x).broadcast1To2()
}
// BroadcastUint8x16 returns a vector with the input
@@ -46,7 +46,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint8x16(x uint8) Uint8x16 {
var z Uint8x16
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}
// BroadcastUint16x8 returns a vector with the input
@@ -55,7 +55,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint16x8(x uint16) Uint16x8 {
var z Uint16x8
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}
// BroadcastUint32x4 returns a vector with the input
@@ -64,7 +64,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint32x4(x uint32) Uint32x4 {
var z Uint32x4
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}
// BroadcastUint64x2 returns a vector with the input
@@ -73,7 +73,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint64x2(x uint64) Uint64x2 {
var z Uint64x2
- return z.SetElem(0, x).Broadcast1To2()
+ return z.SetElem(0, x).broadcast1To2()
}
// BroadcastFloat32x4 returns a vector with the input
@@ -82,7 +82,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastFloat32x4(x float32) Float32x4 {
var z Float32x4
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}
// BroadcastFloat64x2 returns a vector with the input
@@ -91,7 +91,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastFloat64x2(x float64) Float64x2 {
var z Float64x2
- return z.SetElem(0, x).Broadcast1To2()
+ return z.SetElem(0, x).broadcast1To2()
}
// BroadcastInt8x32 returns a vector with the input
@@ -100,7 +100,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt8x32(x int8) Int8x32 {
var z Int8x16
- return z.SetElem(0, x).Broadcast1To32()
+ return z.SetElem(0, x).broadcast1To32()
}
// BroadcastInt16x16 returns a vector with the input
@@ -109,7 +109,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt16x16(x int16) Int16x16 {
var z Int16x8
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}
// BroadcastInt32x8 returns a vector with the input
@@ -118,7 +118,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt32x8(x int32) Int32x8 {
var z Int32x4
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}
// BroadcastInt64x4 returns a vector with the input
@@ -127,7 +127,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastInt64x4(x int64) Int64x4 {
var z Int64x2
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}
// BroadcastUint8x32 returns a vector with the input
@@ -136,7 +136,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint8x32(x uint8) Uint8x32 {
var z Uint8x16
- return z.SetElem(0, x).Broadcast1To32()
+ return z.SetElem(0, x).broadcast1To32()
}
// BroadcastUint16x16 returns a vector with the input
@@ -145,7 +145,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint16x16(x uint16) Uint16x16 {
var z Uint16x8
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}
// BroadcastUint32x8 returns a vector with the input
@@ -154,7 +154,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint32x8(x uint32) Uint32x8 {
var z Uint32x4
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}
// BroadcastUint64x4 returns a vector with the input
@@ -163,7 +163,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastUint64x4(x uint64) Uint64x4 {
var z Uint64x2
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}
// BroadcastFloat32x8 returns a vector with the input
@@ -172,7 +172,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastFloat32x8(x float32) Float32x8 {
var z Float32x4
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}
// BroadcastFloat64x4 returns a vector with the input
@@ -181,7 +181,7 @@
// Emulated, CPU Feature: AVX2
func BroadcastFloat64x4(x float64) Float64x4 {
var z Float64x2
- return z.SetElem(0, x).Broadcast1To4()
+ return z.SetElem(0, x).broadcast1To4()
}
// BroadcastInt8x64 returns a vector with the input
@@ -190,7 +190,7 @@
// Emulated, CPU Feature: AVX512BW
func BroadcastInt8x64(x int8) Int8x64 {
var z Int8x16
- return z.SetElem(0, x).Broadcast1To64()
+ return z.SetElem(0, x).broadcast1To64()
}
// BroadcastInt16x32 returns a vector with the input
@@ -199,7 +199,7 @@
// Emulated, CPU Feature: AVX512BW
func BroadcastInt16x32(x int16) Int16x32 {
var z Int16x8
- return z.SetElem(0, x).Broadcast1To32()
+ return z.SetElem(0, x).broadcast1To32()
}
// BroadcastInt32x16 returns a vector with the input
@@ -208,7 +208,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastInt32x16(x int32) Int32x16 {
var z Int32x4
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}
// BroadcastInt64x8 returns a vector with the input
@@ -217,7 +217,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastInt64x8(x int64) Int64x8 {
var z Int64x2
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}
// BroadcastUint8x64 returns a vector with the input
@@ -226,7 +226,7 @@
// Emulated, CPU Feature: AVX512BW
func BroadcastUint8x64(x uint8) Uint8x64 {
var z Uint8x16
- return z.SetElem(0, x).Broadcast1To64()
+ return z.SetElem(0, x).broadcast1To64()
}
// BroadcastUint16x32 returns a vector with the input
@@ -235,7 +235,7 @@
// Emulated, CPU Feature: AVX512BW
func BroadcastUint16x32(x uint16) Uint16x32 {
var z Uint16x8
- return z.SetElem(0, x).Broadcast1To32()
+ return z.SetElem(0, x).broadcast1To32()
}
// BroadcastUint32x16 returns a vector with the input
@@ -244,7 +244,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastUint32x16(x uint32) Uint32x16 {
var z Uint32x4
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}
// BroadcastUint64x8 returns a vector with the input
@@ -253,7 +253,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastUint64x8(x uint64) Uint64x8 {
var z Uint64x2
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}
// BroadcastFloat32x16 returns a vector with the input
@@ -262,7 +262,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastFloat32x16(x float32) Float32x16 {
var z Float32x4
- return z.SetElem(0, x).Broadcast1To16()
+ return z.SetElem(0, x).broadcast1To16()
}
// BroadcastFloat64x8 returns a vector with the input
@@ -271,7 +271,7 @@
// Emulated, CPU Feature: AVX512F
func BroadcastFloat64x8(x float64) Float64x8 {
var z Float64x2
- return z.SetElem(0, x).Broadcast1To8()
+ return z.SetElem(0, x).broadcast1To8()
}
// ToMask returns a mask whose i'th element is set if x[i] is non-zero.