diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 7659601..618a7ad 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -1425,9 +1425,6 @@
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2QMasked512,
- ssa.OpAMD64VPALIGNRMasked256Merging,
- ssa.OpAMD64VPALIGNRMasked512Merging,
- ssa.OpAMD64VPALIGNRMasked128Merging,
ssa.OpAMD64VDIVPSMasked128Merging,
ssa.OpAMD64VDIVPSMasked256Merging,
ssa.OpAMD64VDIVPSMasked512Merging,
@@ -1575,24 +1572,6 @@
ssa.OpAMD64VSCALEFPDMasked128Merging,
ssa.OpAMD64VSCALEFPDMasked256Merging,
ssa.OpAMD64VSCALEFPDMasked512Merging,
- ssa.OpAMD64VPSHLDWMasked128Merging,
- ssa.OpAMD64VPSHLDWMasked256Merging,
- ssa.OpAMD64VPSHLDWMasked512Merging,
- ssa.OpAMD64VPSHLDDMasked128Merging,
- ssa.OpAMD64VPSHLDDMasked256Merging,
- ssa.OpAMD64VPSHLDDMasked512Merging,
- ssa.OpAMD64VPSHLDQMasked128Merging,
- ssa.OpAMD64VPSHLDQMasked256Merging,
- ssa.OpAMD64VPSHLDQMasked512Merging,
- ssa.OpAMD64VPSHRDWMasked128Merging,
- ssa.OpAMD64VPSHRDWMasked256Merging,
- ssa.OpAMD64VPSHRDWMasked512Merging,
- ssa.OpAMD64VPSHRDDMasked128Merging,
- ssa.OpAMD64VPSHRDDMasked256Merging,
- ssa.OpAMD64VPSHRDDMasked512Merging,
- ssa.OpAMD64VPSHRDQMasked128Merging,
- ssa.OpAMD64VPSHRDQMasked256Merging,
- ssa.OpAMD64VPSHRDQMasked512Merging,
ssa.OpAMD64VPSHLDVWMasked128,
ssa.OpAMD64VPSHLDVWMasked256,
ssa.OpAMD64VPSHLDVWMasked512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 2e295d9..ff76cc0 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -2068,7 +2068,6 @@
(VPBLENDMBMasked512 dst (VPADDB512 x y) mask) => (VPADDBMasked512Merging dst x y mask)
(VPBLENDMBMasked512 dst (VPADDSB512 x y) mask) => (VPADDSBMasked512Merging dst x y mask)
(VPBLENDMBMasked512 dst (VPADDUSB512 x y) mask) => (VPADDUSBMasked512Merging dst x y mask)
-(VPBLENDMBMasked512 dst (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512Merging dst [a] x y mask)
(VPBLENDMBMasked512 dst (VPAVGB512 x y) mask) => (VPAVGBMasked512Merging dst x y mask)
(VPBLENDMBMasked512 dst (VPMAXSB512 x y) mask) => (VPMAXSBMasked512Merging dst x y mask)
(VPBLENDMBMasked512 dst (VPMAXUB512 x y) mask) => (VPMAXUBMasked512Merging dst x y mask)
@@ -2111,8 +2110,6 @@
(VPBLENDMDMasked512 dst (VPROLVD512 x y) mask) => (VPROLVDMasked512Merging dst x y mask)
(VPBLENDMDMasked512 dst (VPRORD512 [a] x) mask) => (VPRORDMasked512Merging dst [a] x mask)
(VPBLENDMDMasked512 dst (VPRORVD512 x y) mask) => (VPRORVDMasked512Merging dst x y mask)
-(VPBLENDMDMasked512 dst (VPSHLDD512 [a] x y) mask) => (VPSHLDDMasked512Merging dst [a] x y mask)
-(VPBLENDMDMasked512 dst (VPSHRDD512 [a] x y) mask) => (VPSHRDDMasked512Merging dst [a] x y mask)
(VPBLENDMDMasked512 dst (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512Merging dst [a] x mask)
(VPBLENDMDMasked512 dst (VPSLLD512const [a] x) mask) => (VPSLLDMasked512constMerging dst [a] x mask)
(VPBLENDMDMasked512 dst (VPSLLVD512 x y) mask) => (VPSLLVDMasked512Merging dst x y mask)
@@ -2167,8 +2164,6 @@
(VPBLENDMQMasked512 dst (VPROLVQ512 x y) mask) => (VPROLVQMasked512Merging dst x y mask)
(VPBLENDMQMasked512 dst (VPRORQ512 [a] x) mask) => (VPRORQMasked512Merging dst [a] x mask)
(VPBLENDMQMasked512 dst (VPRORVQ512 x y) mask) => (VPRORVQMasked512Merging dst x y mask)
-(VPBLENDMQMasked512 dst (VPSHLDQ512 [a] x y) mask) => (VPSHLDQMasked512Merging dst [a] x y mask)
-(VPBLENDMQMasked512 dst (VPSHRDQ512 [a] x y) mask) => (VPSHRDQMasked512Merging dst [a] x y mask)
(VPBLENDMQMasked512 dst (VPSLLQ512const [a] x) mask) => (VPSLLQMasked512constMerging dst [a] x mask)
(VPBLENDMQMasked512 dst (VPSLLVQ512 x y) mask) => (VPSLLVQMasked512Merging dst x y mask)
(VPBLENDMQMasked512 dst (VPSRAQ512const [a] x) mask) => (VPSRAQMasked512constMerging dst [a] x mask)
@@ -2202,8 +2197,6 @@
(VPBLENDMWMasked512 dst (VPMULHW512 x y) mask) => (VPMULHWMasked512Merging dst x y mask)
(VPBLENDMWMasked512 dst (VPMULLW512 x y) mask) => (VPMULLWMasked512Merging dst x y mask)
(VPBLENDMWMasked512 dst (VPOPCNTW512 x) mask) => (VPOPCNTWMasked512Merging dst x mask)
-(VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask) => (VPSHLDWMasked512Merging dst [a] x y mask)
-(VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512Merging dst [a] x y mask)
(VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512Merging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512Merging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) => (VPSLLVWMasked512Merging dst x y mask)
@@ -2263,7 +2256,6 @@
(VPBLENDVB128 dst (VPADDUSB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPADDUSW128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSWMasked128Merging dst x y (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPADDW128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDWMasked128Merging dst x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(VPBLENDVB128 dst (VPALIGNR128 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPALIGNRMasked128Merging dst [a] x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPAVGB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPAVGW128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGWMasked128Merging dst x y (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPBROADCASTB128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPBROADCASTBMasked128Merging dst x (VPMOVVec8x16ToM <types.TypeMask> mask))
@@ -2363,12 +2355,6 @@
(VPBLENDVB128 dst (VPRORQ128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPRORQMasked128Merging dst [a] x (VPMOVVec64x2ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPRORVD128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPRORVDMasked128Merging dst x y (VPMOVVec32x4ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPRORVQ128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPRORVQMasked128Merging dst x y (VPMOVVec64x2ToM <types.TypeMask> mask))
-(VPBLENDVB128 dst (VPSHLDD128 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHLDDMasked128Merging dst [a] x y (VPMOVVec32x4ToM <types.TypeMask> mask))
-(VPBLENDVB128 dst (VPSHLDQ128 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHLDQMasked128Merging dst [a] x y (VPMOVVec64x2ToM <types.TypeMask> mask))
-(VPBLENDVB128 dst (VPSHLDW128 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHLDWMasked128Merging dst [a] x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(VPBLENDVB128 dst (VPSHRDD128 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHRDDMasked128Merging dst [a] x y (VPMOVVec32x4ToM <types.TypeMask> mask))
-(VPBLENDVB128 dst (VPSHRDQ128 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHRDQMasked128Merging dst [a] x y (VPMOVVec64x2ToM <types.TypeMask> mask))
-(VPBLENDVB128 dst (VPSHRDW128 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHRDWMasked128Merging dst [a] x y (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFD128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked128Merging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFHW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
@@ -2454,7 +2440,6 @@
(VPBLENDVB256 dst (VPADDUSB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPADDUSW256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSWMasked256Merging dst x y (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPADDW256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDWMasked256Merging dst x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(VPBLENDVB256 dst (VPALIGNR256 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPALIGNRMasked256Merging dst [a] x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPAVGB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPAVGW256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGWMasked256Merging dst x y (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPLZCNTD256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPLZCNTDMasked256Merging dst x (VPMOVVec32x8ToM <types.TypeMask> mask))
@@ -2518,12 +2503,6 @@
(VPBLENDVB256 dst (VPRORQ256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPRORQMasked256Merging dst [a] x (VPMOVVec64x4ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPRORVD256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPRORVDMasked256Merging dst x y (VPMOVVec32x8ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPRORVQ256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPRORVQMasked256Merging dst x y (VPMOVVec64x4ToM <types.TypeMask> mask))
-(VPBLENDVB256 dst (VPSHLDD256 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHLDDMasked256Merging dst [a] x y (VPMOVVec32x8ToM <types.TypeMask> mask))
-(VPBLENDVB256 dst (VPSHLDQ256 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHLDQMasked256Merging dst [a] x y (VPMOVVec64x4ToM <types.TypeMask> mask))
-(VPBLENDVB256 dst (VPSHLDW256 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHLDWMasked256Merging dst [a] x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(VPBLENDVB256 dst (VPSHRDD256 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHRDDMasked256Merging dst [a] x y (VPMOVVec32x8ToM <types.TypeMask> mask))
-(VPBLENDVB256 dst (VPSHRDQ256 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHRDQMasked256Merging dst [a] x y (VPMOVVec64x4ToM <types.TypeMask> mask))
-(VPBLENDVB256 dst (VPSHRDW256 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHRDWMasked256Merging dst [a] x y (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFD256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked256Merging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFHW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
index 8cd409f..4adfb4f 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -2540,9 +2540,6 @@
{name: "VSUBPSMasked128Merging", argLength: 4, reg: w3kw, asm: "VSUBPS", typ: "Vec128", resultInArg0: true},
{name: "VSUBPSMasked256Merging", argLength: 4, reg: w3kw, asm: "VSUBPS", typ: "Vec256", resultInArg0: true},
{name: "VSUBPSMasked512Merging", argLength: 4, reg: w3kw, asm: "VSUBPS", typ: "Vec512", resultInArg0: true},
- {name: "VPALIGNRMasked128Merging", argLength: 4, reg: w3kw, asm: "VPALIGNR", aux: "UInt8", typ: "Vec128", resultInArg0: true},
- {name: "VPALIGNRMasked256Merging", argLength: 4, reg: w3kw, asm: "VPALIGNR", aux: "UInt8", typ: "Vec256", resultInArg0: true},
- {name: "VPALIGNRMasked512Merging", argLength: 4, reg: w3kw, asm: "VPALIGNR", aux: "UInt8", typ: "Vec512", resultInArg0: true},
{name: "VPROLDMasked128Merging", argLength: 3, reg: w2kw, asm: "VPROLD", aux: "UInt8", typ: "Vec128", resultInArg0: true},
{name: "VPROLDMasked256Merging", argLength: 3, reg: w2kw, asm: "VPROLD", aux: "UInt8", typ: "Vec256", resultInArg0: true},
{name: "VPROLDMasked512Merging", argLength: 3, reg: w2kw, asm: "VPROLD", aux: "UInt8", typ: "Vec512", resultInArg0: true},
@@ -2555,24 +2552,6 @@
{name: "VPRORQMasked128Merging", argLength: 3, reg: w2kw, asm: "VPRORQ", aux: "UInt8", typ: "Vec128", resultInArg0: true},
{name: "VPRORQMasked256Merging", argLength: 3, reg: w2kw, asm: "VPRORQ", aux: "UInt8", typ: "Vec256", resultInArg0: true},
{name: "VPRORQMasked512Merging", argLength: 3, reg: w2kw, asm: "VPRORQ", aux: "UInt8", typ: "Vec512", resultInArg0: true},
- {name: "VPSHLDDMasked128Merging", argLength: 4, reg: w3kw, asm: "VPSHLDD", aux: "UInt8", typ: "Vec128", resultInArg0: true},
- {name: "VPSHLDDMasked256Merging", argLength: 4, reg: w3kw, asm: "VPSHLDD", aux: "UInt8", typ: "Vec256", resultInArg0: true},
- {name: "VPSHLDDMasked512Merging", argLength: 4, reg: w3kw, asm: "VPSHLDD", aux: "UInt8", typ: "Vec512", resultInArg0: true},
- {name: "VPSHLDQMasked128Merging", argLength: 4, reg: w3kw, asm: "VPSHLDQ", aux: "UInt8", typ: "Vec128", resultInArg0: true},
- {name: "VPSHLDQMasked256Merging", argLength: 4, reg: w3kw, asm: "VPSHLDQ", aux: "UInt8", typ: "Vec256", resultInArg0: true},
- {name: "VPSHLDQMasked512Merging", argLength: 4, reg: w3kw, asm: "VPSHLDQ", aux: "UInt8", typ: "Vec512", resultInArg0: true},
- {name: "VPSHLDWMasked128Merging", argLength: 4, reg: w3kw, asm: "VPSHLDW", aux: "UInt8", typ: "Vec128", resultInArg0: true},
- {name: "VPSHLDWMasked256Merging", argLength: 4, reg: w3kw, asm: "VPSHLDW", aux: "UInt8", typ: "Vec256", resultInArg0: true},
- {name: "VPSHLDWMasked512Merging", argLength: 4, reg: w3kw, asm: "VPSHLDW", aux: "UInt8", typ: "Vec512", resultInArg0: true},
- {name: "VPSHRDDMasked128Merging", argLength: 4, reg: w3kw, asm: "VPSHRDD", aux: "UInt8", typ: "Vec128", resultInArg0: true},
- {name: "VPSHRDDMasked256Merging", argLength: 4, reg: w3kw, asm: "VPSHRDD", aux: "UInt8", typ: "Vec256", resultInArg0: true},
- {name: "VPSHRDDMasked512Merging", argLength: 4, reg: w3kw, asm: "VPSHRDD", aux: "UInt8", typ: "Vec512", resultInArg0: true},
- {name: "VPSHRDQMasked128Merging", argLength: 4, reg: w3kw, asm: "VPSHRDQ", aux: "UInt8", typ: "Vec128", resultInArg0: true},
- {name: "VPSHRDQMasked256Merging", argLength: 4, reg: w3kw, asm: "VPSHRDQ", aux: "UInt8", typ: "Vec256", resultInArg0: true},
- {name: "VPSHRDQMasked512Merging", argLength: 4, reg: w3kw, asm: "VPSHRDQ", aux: "UInt8", typ: "Vec512", resultInArg0: true},
- {name: "VPSHRDWMasked128Merging", argLength: 4, reg: w3kw, asm: "VPSHRDW", aux: "UInt8", typ: "Vec128", resultInArg0: true},
- {name: "VPSHRDWMasked256Merging", argLength: 4, reg: w3kw, asm: "VPSHRDW", aux: "UInt8", typ: "Vec256", resultInArg0: true},
- {name: "VPSHRDWMasked512Merging", argLength: 4, reg: w3kw, asm: "VPSHRDW", aux: "UInt8", typ: "Vec512", resultInArg0: true},
{name: "VPSHUFDMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFD", aux: "UInt8", typ: "Vec128", resultInArg0: true},
{name: "VPSHUFDMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFD", aux: "UInt8", typ: "Vec256", resultInArg0: true},
{name: "VPSHUFDMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFD", aux: "UInt8", typ: "Vec512", resultInArg0: true},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 50e7df3..f3d6832 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -3806,9 +3806,6 @@
OpAMD64VSUBPSMasked128Merging
OpAMD64VSUBPSMasked256Merging
OpAMD64VSUBPSMasked512Merging
- OpAMD64VPALIGNRMasked128Merging
- OpAMD64VPALIGNRMasked256Merging
- OpAMD64VPALIGNRMasked512Merging
OpAMD64VPROLDMasked128Merging
OpAMD64VPROLDMasked256Merging
OpAMD64VPROLDMasked512Merging
@@ -3821,24 +3818,6 @@
OpAMD64VPRORQMasked128Merging
OpAMD64VPRORQMasked256Merging
OpAMD64VPRORQMasked512Merging
- OpAMD64VPSHLDDMasked128Merging
- OpAMD64VPSHLDDMasked256Merging
- OpAMD64VPSHLDDMasked512Merging
- OpAMD64VPSHLDQMasked128Merging
- OpAMD64VPSHLDQMasked256Merging
- OpAMD64VPSHLDQMasked512Merging
- OpAMD64VPSHLDWMasked128Merging
- OpAMD64VPSHLDWMasked256Merging
- OpAMD64VPSHLDWMasked512Merging
- OpAMD64VPSHRDDMasked128Merging
- OpAMD64VPSHRDDMasked256Merging
- OpAMD64VPSHRDDMasked512Merging
- OpAMD64VPSHRDQMasked128Merging
- OpAMD64VPSHRDQMasked256Merging
- OpAMD64VPSHRDQMasked512Merging
- OpAMD64VPSHRDWMasked128Merging
- OpAMD64VPSHRDWMasked256Merging
- OpAMD64VPSHRDWMasked512Merging
OpAMD64VPSHUFDMasked128Merging
OpAMD64VPSHUFDMasked256Merging
OpAMD64VPSHUFDMasked512Merging
@@ -60239,60 +60218,6 @@
},
},
{
- name: "VPALIGNRMasked128Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPALIGNR,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPALIGNRMasked256Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPALIGNR,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPALIGNRMasked512Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPALIGNR,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
name: "VPROLDMasked128Merging",
auxType: auxUInt8,
argLen: 3,
@@ -60497,330 +60422,6 @@
},
},
{
- name: "VPSHLDDMasked128Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHLDD,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHLDDMasked256Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHLDD,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHLDDMasked512Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHLDD,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHLDQMasked128Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHLDQ,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHLDQMasked256Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHLDQ,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHLDQMasked512Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHLDQ,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHLDWMasked128Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHLDW,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHLDWMasked256Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHLDW,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHLDWMasked512Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHLDW,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHRDDMasked128Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHRDD,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHRDDMasked256Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHRDD,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHRDDMasked512Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHRDD,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHRDQMasked128Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHRDQ,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHRDQMasked256Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHRDQ,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHRDQMasked512Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHRDQ,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHRDWMasked128Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHRDW,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHRDWMasked256Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHRDW,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
- name: "VPSHRDWMasked512Merging",
- auxType: auxUInt8,
- argLen: 4,
- resultInArg0: true,
- asm: x86.AVPSHRDW,
- reg: regInfo{
- inputs: []inputInfo{
- {3, regMask{v1: 71494644084506624, v2: 0}}, // K1 K2 K3 K4 K5 K6 K7
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {1, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- {2, regMask{v1: 281474976645120, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- outputs: []outputInfo{
- {0, regMask{v1: 281472829161472, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
- },
- },
- },
- {
name: "VPSHUFDMasked128Merging",
auxType: auxUInt8,
argLen: 3,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index dc3c553..648b7b8 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -43559,22 +43559,6 @@
v.AddArg4(dst, x, y, mask)
return true
}
- // match: (VPBLENDMBMasked512 dst (VPALIGNR512 [a] x y) mask)
- // result: (VPALIGNRMasked512Merging dst [a] x y mask)
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPALIGNR512 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- v.reset(OpAMD64VPALIGNRMasked512Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg4(dst, x, y, mask)
- return true
- }
// match: (VPBLENDMBMasked512 dst (VPAVGB512 x y) mask)
// result: (VPAVGBMasked512Merging dst x y mask)
for {
@@ -44157,38 +44141,6 @@
v.AddArg4(dst, x, y, mask)
return true
}
- // match: (VPBLENDMDMasked512 dst (VPSHLDD512 [a] x y) mask)
- // result: (VPSHLDDMasked512Merging dst [a] x y mask)
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHLDD512 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- v.reset(OpAMD64VPSHLDDMasked512Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg4(dst, x, y, mask)
- return true
- }
- // match: (VPBLENDMDMasked512 dst (VPSHRDD512 [a] x y) mask)
- // result: (VPSHRDDMasked512Merging dst [a] x y mask)
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHRDD512 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- v.reset(OpAMD64VPSHRDDMasked512Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg4(dst, x, y, mask)
- return true
- }
// match: (VPBLENDMDMasked512 dst (VPSHUFD512 [a] x) mask)
// result: (VPSHUFDMasked512Merging dst [a] x mask)
for {
@@ -44958,38 +44910,6 @@
v.AddArg4(dst, x, y, mask)
return true
}
- // match: (VPBLENDMQMasked512 dst (VPSHLDQ512 [a] x y) mask)
- // result: (VPSHLDQMasked512Merging dst [a] x y mask)
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHLDQ512 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- v.reset(OpAMD64VPSHLDQMasked512Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg4(dst, x, y, mask)
- return true
- }
- // match: (VPBLENDMQMasked512 dst (VPSHRDQ512 [a] x y) mask)
- // result: (VPSHRDQMasked512Merging dst [a] x y mask)
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHRDQ512 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- v.reset(OpAMD64VPSHRDQMasked512Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg4(dst, x, y, mask)
- return true
- }
// match: (VPBLENDMQMasked512 dst (VPSLLQ512const [a] x) mask)
// result: (VPSLLQMasked512constMerging dst [a] x mask)
for {
@@ -45478,38 +45398,6 @@
v.AddArg3(dst, x, mask)
return true
}
- // match: (VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask)
- // result: (VPSHLDWMasked512Merging dst [a] x y mask)
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHLDW512 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- v.reset(OpAMD64VPSHLDWMasked512Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg4(dst, x, y, mask)
- return true
- }
- // match: (VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask)
- // result: (VPSHRDWMasked512Merging dst [a] x y mask)
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHRDW512 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- v.reset(OpAMD64VPSHRDWMasked512Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg4(dst, x, y, mask)
- return true
- }
// match: (VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask)
// result: (VPSHUFHWMasked512Merging dst [a] x mask)
for {
@@ -46609,28 +46497,6 @@
v.AddArg4(dst, x, y, v0)
return true
}
- // match: (VPBLENDVB128 dst (VPALIGNR128 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPALIGNRMasked128Merging dst [a] x y (VPMOVVec8x16ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPALIGNR128 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPALIGNRMasked128Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
// match: (VPBLENDVB128 dst (VPAVGB128 x y) mask)
// cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
// result: (VPAVGBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
@@ -48549,138 +48415,6 @@
v.AddArg4(dst, x, y, v0)
return true
}
- // match: (VPBLENDVB128 dst (VPSHLDD128 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHLDDMasked128Merging dst [a] x y (VPMOVVec32x4ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHLDD128 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHLDDMasked128Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
- // match: (VPBLENDVB128 dst (VPSHLDQ128 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHLDQMasked128Merging dst [a] x y (VPMOVVec64x2ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHLDQ128 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHLDQMasked128Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
- // match: (VPBLENDVB128 dst (VPSHLDW128 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHLDWMasked128Merging dst [a] x y (VPMOVVec16x8ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHLDW128 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHLDWMasked128Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
- // match: (VPBLENDVB128 dst (VPSHRDD128 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHRDDMasked128Merging dst [a] x y (VPMOVVec32x4ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHRDD128 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHRDDMasked128Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
- // match: (VPBLENDVB128 dst (VPSHRDQ128 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHRDQMasked128Merging dst [a] x y (VPMOVVec64x2ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHRDQ128 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHRDQMasked128Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
- // match: (VPBLENDVB128 dst (VPSHRDW128 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHRDWMasked128Merging dst [a] x y (VPMOVVec16x8ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHRDW128 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHRDWMasked128Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
// match: (VPBLENDVB128 dst (VPSHUFB128 x y) mask)
// cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
// result: (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
@@ -50378,28 +50112,6 @@
v.AddArg4(dst, x, y, v0)
return true
}
- // match: (VPBLENDVB256 dst (VPALIGNR256 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPALIGNRMasked256Merging dst [a] x y (VPMOVVec8x32ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPALIGNR256 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPALIGNRMasked256Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
// match: (VPBLENDVB256 dst (VPAVGB256 x y) mask)
// cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
// result: (VPAVGBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
@@ -51634,138 +51346,6 @@
v.AddArg4(dst, x, y, v0)
return true
}
- // match: (VPBLENDVB256 dst (VPSHLDD256 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHLDDMasked256Merging dst [a] x y (VPMOVVec32x8ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHLDD256 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHLDDMasked256Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
- // match: (VPBLENDVB256 dst (VPSHLDQ256 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHLDQMasked256Merging dst [a] x y (VPMOVVec64x4ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHLDQ256 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHLDQMasked256Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
- // match: (VPBLENDVB256 dst (VPSHLDW256 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHLDWMasked256Merging dst [a] x y (VPMOVVec16x16ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHLDW256 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHLDWMasked256Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
- // match: (VPBLENDVB256 dst (VPSHRDD256 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHRDDMasked256Merging dst [a] x y (VPMOVVec32x8ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHRDD256 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHRDDMasked256Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
- // match: (VPBLENDVB256 dst (VPSHRDQ256 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHRDQMasked256Merging dst [a] x y (VPMOVVec64x4ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHRDQ256 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHRDQMasked256Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
- // match: (VPBLENDVB256 dst (VPSHRDW256 [a] x y) mask)
- // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
- // result: (VPSHRDWMasked256Merging dst [a] x y (VPMOVVec16x16ToM <types.TypeMask> mask))
- for {
- dst := v_0
- if v_1.Op != OpAMD64VPSHRDW256 {
- break
- }
- a := auxIntToUint8(v_1.AuxInt)
- y := v_1.Args[1]
- x := v_1.Args[0]
- mask := v_2
- if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
- break
- }
- v.reset(OpAMD64VPSHRDWMasked256Merging)
- v.AuxInt = uint8ToAuxInt(a)
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
- v0.AddArg(mask)
- v.AddArg4(dst, x, y, v0)
- return true
- }
// match: (VPBLENDVB256 dst (VPSHUFB256 x y) mask)
// cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
// result: (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
diff --git a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
index e8cab1a..326a7ee 100644
--- a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
@@ -226,9 +226,9 @@
addF(simdPackage, "Float64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Float64x2.ConvertToFloat32", opLen1(ssa.OpConvertToFloat32Float64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.ConvertToFloat32", opLen1(ssa.OpConvertToFloat32Float64x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x8.ConvertToFloat32", opLen1(ssa.OpConvertToFloat32Float64x8, types.TypeVec256), sys.AMD64)
@@ -992,24 +992,24 @@
addF(simdPackage, "Uint64x2.ShiftAllLeft", opLen2(ssa.OpShiftAllLeftUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.ShiftAllLeft", opLen2(ssa.OpShiftAllLeftUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.ShiftAllLeft", opLen2(ssa.OpShiftAllLeftUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatInt16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int16x16.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatInt16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int16x32.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatInt16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Int32x4.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatInt32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int32x8.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatInt32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int32x16.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatInt32x16, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Int64x2.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatInt64x2, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int64x4.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatInt64x4, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int64x8.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatInt64x8, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint16x8.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatUint16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint16x16.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatUint16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint16x32.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatUint16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint32x4.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatUint32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint32x8.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatUint32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint32x16.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatUint32x16, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint64x2.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatUint64x2, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint64x4.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatUint64x4, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint64x8.ShiftAllLeftConcat", opLen2Imm8(ssa.OpShiftAllLeftConcatUint64x8, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int16x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int16x16.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int16x32.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt16x32, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int32x4.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt32x4, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int32x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int32x16.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt32x16, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int64x2.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt64x2, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int64x4.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt64x4, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int64x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt64x8, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x32.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint16x32, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x4.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint32x4, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x16.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint32x16, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x2.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint64x2, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x4.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint64x4, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint64x8, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.ShiftAllRight", opLen2(ssa.OpShiftAllRightInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.ShiftAllRight", opLen2(ssa.OpShiftAllRightInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.ShiftAllRight", opLen2(ssa.OpShiftAllRightInt16x32, types.TypeVec512), sys.AMD64)
@@ -1028,24 +1028,24 @@
addF(simdPackage, "Uint64x2.ShiftAllRight", opLen2(ssa.OpShiftAllRightUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.ShiftAllRight", opLen2(ssa.OpShiftAllRightUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.ShiftAllRight", opLen2(ssa.OpShiftAllRightUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatInt16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int16x16.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatInt16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int16x32.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatInt16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Int32x4.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatInt32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int32x8.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatInt32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int32x16.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatInt32x16, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Int64x2.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatInt64x2, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int64x4.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatInt64x4, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int64x8.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatInt64x8, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint16x8.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatUint16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint16x16.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatUint16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint16x32.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatUint16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint32x4.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatUint32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint32x8.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatUint32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint32x16.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatUint32x16, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint64x2.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatUint64x2, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint64x4.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatUint64x4, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint64x8.ShiftAllRightConcat", opLen2Imm8(ssa.OpShiftAllRightConcatUint64x8, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int16x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int16x16.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int16x32.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt16x32, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int32x4.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt32x4, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int32x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int32x16.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt32x16, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int64x2.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt64x2, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int64x4.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt64x4, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int64x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt64x8, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x32.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint16x32, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x4.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint32x4, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x16.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint32x16, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x2.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint64x2, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x4.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint64x4, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint64x8, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.ShiftLeft", opLen2(ssa.OpShiftLeftInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.ShiftLeft", opLen2(ssa.OpShiftLeftInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.ShiftLeft", opLen2(ssa.OpShiftLeftInt16x32, types.TypeVec512), sys.AMD64)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
index 1c0e371..927f88c 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
@@ -1027,6 +1027,7 @@
- go: ConcatShiftBytesRight
asm: VPALIGNR
+ operandOrder: 2I
in:
- &uint128
go: $t
@@ -1041,6 +1042,7 @@
- go: ConcatShiftBytesRightGrouped
asm: VPALIGNR
+ operandOrder: 2I
in:
- &uint256512
go: $t
diff --git a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go.yaml
index e2cc2ba..ffbc6da 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go.yaml
@@ -158,6 +158,7 @@
# Bizzare shifts.
- go: ShiftAllLeftConcat
asm: "VPSHLD[WDQ]"
+ operandOrder: 2I
in:
- *any
- *any
@@ -166,6 +167,7 @@
- *any
- go: ShiftAllRightConcat
asm: "VPSHRD[WDQ]"
+ operandOrder: 2I
in:
- *any
- *any
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index 288eeb4..5c03258 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -1407,7 +1407,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPALIGNR, CPU Feature: AVX
-func (x Uint8x16) ConcatShiftBytesRight(shift uint8, y Uint8x16) Uint8x16
+func (x Uint8x16) ConcatShiftBytesRight(y Uint8x16, shift uint8) Uint8x16
/* ConcatShiftBytesRightGrouped */
@@ -1418,7 +1418,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPALIGNR, CPU Feature: AVX2
-func (x Uint8x32) ConcatShiftBytesRightGrouped(shift uint8, y Uint8x32) Uint8x32
+func (x Uint8x32) ConcatShiftBytesRightGrouped(y Uint8x32, shift uint8) Uint8x32
// ConcatShiftBytesRightGrouped concatenates x and y and shifts it right by shift bytes.
// The result vector will be the lower half of the concatenated vector.
@@ -1427,7 +1427,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPALIGNR, CPU Feature: AVX512
-func (x Uint8x64) ConcatShiftBytesRightGrouped(shift uint8, y Uint8x64) Uint8x64
+func (x Uint8x64) ConcatShiftBytesRightGrouped(y Uint8x64, shift uint8) Uint8x64
/* ConvertToFloat32 */
@@ -5994,7 +5994,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Int16x8) ShiftAllLeftConcat(shift uint8, y Int16x8) Int16x8
+func (x Int16x8) ShiftAllLeftConcat(y Int16x8, shift uint8) Int16x8
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6004,7 +6004,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Int16x16) ShiftAllLeftConcat(shift uint8, y Int16x16) Int16x16
+func (x Int16x16) ShiftAllLeftConcat(y Int16x16, shift uint8) Int16x16
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6014,7 +6014,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Int16x32) ShiftAllLeftConcat(shift uint8, y Int16x32) Int16x32
+func (x Int16x32) ShiftAllLeftConcat(y Int16x32, shift uint8) Int16x32
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6024,7 +6024,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Int32x4) ShiftAllLeftConcat(shift uint8, y Int32x4) Int32x4
+func (x Int32x4) ShiftAllLeftConcat(y Int32x4, shift uint8) Int32x4
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6034,7 +6034,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Int32x8) ShiftAllLeftConcat(shift uint8, y Int32x8) Int32x8
+func (x Int32x8) ShiftAllLeftConcat(y Int32x8, shift uint8) Int32x8
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6044,7 +6044,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Int32x16) ShiftAllLeftConcat(shift uint8, y Int32x16) Int32x16
+func (x Int32x16) ShiftAllLeftConcat(y Int32x16, shift uint8) Int32x16
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6054,7 +6054,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Int64x2) ShiftAllLeftConcat(shift uint8, y Int64x2) Int64x2
+func (x Int64x2) ShiftAllLeftConcat(y Int64x2, shift uint8) Int64x2
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6064,7 +6064,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Int64x4) ShiftAllLeftConcat(shift uint8, y Int64x4) Int64x4
+func (x Int64x4) ShiftAllLeftConcat(y Int64x4, shift uint8) Int64x4
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6074,7 +6074,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Int64x8) ShiftAllLeftConcat(shift uint8, y Int64x8) Int64x8
+func (x Int64x8) ShiftAllLeftConcat(y Int64x8, shift uint8) Int64x8
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6084,7 +6084,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) ShiftAllLeftConcat(shift uint8, y Uint16x8) Uint16x8
+func (x Uint16x8) ShiftAllLeftConcat(y Uint16x8, shift uint8) Uint16x8
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6094,7 +6094,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) ShiftAllLeftConcat(shift uint8, y Uint16x16) Uint16x16
+func (x Uint16x16) ShiftAllLeftConcat(y Uint16x16, shift uint8) Uint16x16
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6104,7 +6104,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) ShiftAllLeftConcat(shift uint8, y Uint16x32) Uint16x32
+func (x Uint16x32) ShiftAllLeftConcat(y Uint16x32, shift uint8) Uint16x32
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6114,7 +6114,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Uint32x4) ShiftAllLeftConcat(shift uint8, y Uint32x4) Uint32x4
+func (x Uint32x4) ShiftAllLeftConcat(y Uint32x4, shift uint8) Uint32x4
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6124,7 +6124,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Uint32x8) ShiftAllLeftConcat(shift uint8, y Uint32x8) Uint32x8
+func (x Uint32x8) ShiftAllLeftConcat(y Uint32x8, shift uint8) Uint32x8
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6134,7 +6134,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Uint32x16) ShiftAllLeftConcat(shift uint8, y Uint32x16) Uint32x16
+func (x Uint32x16) ShiftAllLeftConcat(y Uint32x16, shift uint8) Uint32x16
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6144,7 +6144,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x2) ShiftAllLeftConcat(shift uint8, y Uint64x2) Uint64x2
+func (x Uint64x2) ShiftAllLeftConcat(y Uint64x2, shift uint8) Uint64x2
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6154,7 +6154,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x4) ShiftAllLeftConcat(shift uint8, y Uint64x4) Uint64x4
+func (x Uint64x4) ShiftAllLeftConcat(y Uint64x4, shift uint8) Uint64x4
// ShiftAllLeftConcat shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
@@ -6164,7 +6164,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x8) ShiftAllLeftConcat(shift uint8, y Uint64x8) Uint64x8
+func (x Uint64x8) ShiftAllLeftConcat(y Uint64x8, shift uint8) Uint64x8
/* ShiftAllRight */
@@ -6286,7 +6286,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Int16x8) ShiftAllRightConcat(shift uint8, y Int16x8) Int16x8
+func (x Int16x8) ShiftAllRightConcat(y Int16x8, shift uint8) Int16x8
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6296,7 +6296,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Int16x16) ShiftAllRightConcat(shift uint8, y Int16x16) Int16x16
+func (x Int16x16) ShiftAllRightConcat(y Int16x16, shift uint8) Int16x16
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6306,7 +6306,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Int16x32) ShiftAllRightConcat(shift uint8, y Int16x32) Int16x32
+func (x Int16x32) ShiftAllRightConcat(y Int16x32, shift uint8) Int16x32
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6316,7 +6316,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Int32x4) ShiftAllRightConcat(shift uint8, y Int32x4) Int32x4
+func (x Int32x4) ShiftAllRightConcat(y Int32x4, shift uint8) Int32x4
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6326,7 +6326,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Int32x8) ShiftAllRightConcat(shift uint8, y Int32x8) Int32x8
+func (x Int32x8) ShiftAllRightConcat(y Int32x8, shift uint8) Int32x8
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6336,7 +6336,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Int32x16) ShiftAllRightConcat(shift uint8, y Int32x16) Int32x16
+func (x Int32x16) ShiftAllRightConcat(y Int32x16, shift uint8) Int32x16
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6346,7 +6346,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Int64x2) ShiftAllRightConcat(shift uint8, y Int64x2) Int64x2
+func (x Int64x2) ShiftAllRightConcat(y Int64x2, shift uint8) Int64x2
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6356,7 +6356,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Int64x4) ShiftAllRightConcat(shift uint8, y Int64x4) Int64x4
+func (x Int64x4) ShiftAllRightConcat(y Int64x4, shift uint8) Int64x4
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6366,7 +6366,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Int64x8) ShiftAllRightConcat(shift uint8, y Int64x8) Int64x8
+func (x Int64x8) ShiftAllRightConcat(y Int64x8, shift uint8) Int64x8
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6376,7 +6376,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) ShiftAllRightConcat(shift uint8, y Uint16x8) Uint16x8
+func (x Uint16x8) ShiftAllRightConcat(y Uint16x8, shift uint8) Uint16x8
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6386,7 +6386,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) ShiftAllRightConcat(shift uint8, y Uint16x16) Uint16x16
+func (x Uint16x16) ShiftAllRightConcat(y Uint16x16, shift uint8) Uint16x16
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6396,7 +6396,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) ShiftAllRightConcat(shift uint8, y Uint16x32) Uint16x32
+func (x Uint16x32) ShiftAllRightConcat(y Uint16x32, shift uint8) Uint16x32
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6406,7 +6406,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Uint32x4) ShiftAllRightConcat(shift uint8, y Uint32x4) Uint32x4
+func (x Uint32x4) ShiftAllRightConcat(y Uint32x4, shift uint8) Uint32x4
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6416,7 +6416,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Uint32x8) ShiftAllRightConcat(shift uint8, y Uint32x8) Uint32x8
+func (x Uint32x8) ShiftAllRightConcat(y Uint32x8, shift uint8) Uint32x8
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6426,7 +6426,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Uint32x16) ShiftAllRightConcat(shift uint8, y Uint32x16) Uint32x16
+func (x Uint32x16) ShiftAllRightConcat(y Uint32x16, shift uint8) Uint32x16
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6436,7 +6436,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x2) ShiftAllRightConcat(shift uint8, y Uint64x2) Uint64x2
+func (x Uint64x2) ShiftAllRightConcat(y Uint64x2, shift uint8) Uint64x2
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6446,7 +6446,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x4) ShiftAllRightConcat(shift uint8, y Uint64x4) Uint64x4
+func (x Uint64x4) ShiftAllRightConcat(y Uint64x4, shift uint8) Uint64x4
// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
@@ -6456,7 +6456,7 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x8) ShiftAllRightConcat(shift uint8, y Uint64x8) Uint64x8
+func (x Uint64x8) ShiftAllRightConcat(y Uint64x8, shift uint8) Uint64x8
/* ShiftLeft */