diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index ff76cc0..620fe56 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -991,24 +991,24 @@
(ShiftAllLeftUint64x2 ...) => (VPSLLQ128 ...)
(ShiftAllLeftUint64x4 ...) => (VPSLLQ256 ...)
(ShiftAllLeftUint64x8 ...) => (VPSLLQ512 ...)
-(ShiftAllLeftConcatInt16x8 ...) => (VPSHLDW128 ...)
-(ShiftAllLeftConcatInt16x16 ...) => (VPSHLDW256 ...)
-(ShiftAllLeftConcatInt16x32 ...) => (VPSHLDW512 ...)
-(ShiftAllLeftConcatInt32x4 ...) => (VPSHLDD128 ...)
-(ShiftAllLeftConcatInt32x8 ...) => (VPSHLDD256 ...)
-(ShiftAllLeftConcatInt32x16 ...) => (VPSHLDD512 ...)
-(ShiftAllLeftConcatInt64x2 ...) => (VPSHLDQ128 ...)
-(ShiftAllLeftConcatInt64x4 ...) => (VPSHLDQ256 ...)
-(ShiftAllLeftConcatInt64x8 ...) => (VPSHLDQ512 ...)
-(ShiftAllLeftConcatUint16x8 ...) => (VPSHLDW128 ...)
-(ShiftAllLeftConcatUint16x16 ...) => (VPSHLDW256 ...)
-(ShiftAllLeftConcatUint16x32 ...) => (VPSHLDW512 ...)
-(ShiftAllLeftConcatUint32x4 ...) => (VPSHLDD128 ...)
-(ShiftAllLeftConcatUint32x8 ...) => (VPSHLDD256 ...)
-(ShiftAllLeftConcatUint32x16 ...) => (VPSHLDD512 ...)
-(ShiftAllLeftConcatUint64x2 ...) => (VPSHLDQ128 ...)
-(ShiftAllLeftConcatUint64x4 ...) => (VPSHLDQ256 ...)
-(ShiftAllLeftConcatUint64x8 ...) => (VPSHLDQ512 ...)
+(ShiftAllLeftConcatMod16Int16x8 ...) => (VPSHLDW128 ...)
+(ShiftAllLeftConcatMod16Int16x16 ...) => (VPSHLDW256 ...)
+(ShiftAllLeftConcatMod16Int16x32 ...) => (VPSHLDW512 ...)
+(ShiftAllLeftConcatMod16Uint16x8 ...) => (VPSHLDW128 ...)
+(ShiftAllLeftConcatMod16Uint16x16 ...) => (VPSHLDW256 ...)
+(ShiftAllLeftConcatMod16Uint16x32 ...) => (VPSHLDW512 ...)
+(ShiftAllLeftConcatMod32Int32x4 ...) => (VPSHLDD128 ...)
+(ShiftAllLeftConcatMod32Int32x8 ...) => (VPSHLDD256 ...)
+(ShiftAllLeftConcatMod32Int32x16 ...) => (VPSHLDD512 ...)
+(ShiftAllLeftConcatMod32Uint32x4 ...) => (VPSHLDD128 ...)
+(ShiftAllLeftConcatMod32Uint32x8 ...) => (VPSHLDD256 ...)
+(ShiftAllLeftConcatMod32Uint32x16 ...) => (VPSHLDD512 ...)
+(ShiftAllLeftConcatMod64Int64x2 ...) => (VPSHLDQ128 ...)
+(ShiftAllLeftConcatMod64Int64x4 ...) => (VPSHLDQ256 ...)
+(ShiftAllLeftConcatMod64Int64x8 ...) => (VPSHLDQ512 ...)
+(ShiftAllLeftConcatMod64Uint64x2 ...) => (VPSHLDQ128 ...)
+(ShiftAllLeftConcatMod64Uint64x4 ...) => (VPSHLDQ256 ...)
+(ShiftAllLeftConcatMod64Uint64x8 ...) => (VPSHLDQ512 ...)
(VPSLLWMasked128 x (MOVQconst [c]) mask) => (VPSLLWMasked128const [amd64CapAVXShift(c)] x mask)
(VPSLLWMasked256 x (MOVQconst [c]) mask) => (VPSLLWMasked256const [amd64CapAVXShift(c)] x mask)
(VPSLLWMasked512 x (MOVQconst [c]) mask) => (VPSLLWMasked512const [amd64CapAVXShift(c)] x mask)
@@ -1054,24 +1054,24 @@
(VPSRLQ256 x (MOVQconst [c])) => (VPSRLQ256const [amd64CapAVXShift(c)] x)
(ShiftAllRightUint64x8 ...) => (VPSRLQ512 ...)
(VPSRLQ512 x (MOVQconst [c])) => (VPSRLQ512const [amd64CapAVXShift(c)] x)
-(ShiftAllRightConcatInt16x8 ...) => (VPSHRDW128 ...)
-(ShiftAllRightConcatInt16x16 ...) => (VPSHRDW256 ...)
-(ShiftAllRightConcatInt16x32 ...) => (VPSHRDW512 ...)
-(ShiftAllRightConcatInt32x4 ...) => (VPSHRDD128 ...)
-(ShiftAllRightConcatInt32x8 ...) => (VPSHRDD256 ...)
-(ShiftAllRightConcatInt32x16 ...) => (VPSHRDD512 ...)
-(ShiftAllRightConcatInt64x2 ...) => (VPSHRDQ128 ...)
-(ShiftAllRightConcatInt64x4 ...) => (VPSHRDQ256 ...)
-(ShiftAllRightConcatInt64x8 ...) => (VPSHRDQ512 ...)
-(ShiftAllRightConcatUint16x8 ...) => (VPSHRDW128 ...)
-(ShiftAllRightConcatUint16x16 ...) => (VPSHRDW256 ...)
-(ShiftAllRightConcatUint16x32 ...) => (VPSHRDW512 ...)
-(ShiftAllRightConcatUint32x4 ...) => (VPSHRDD128 ...)
-(ShiftAllRightConcatUint32x8 ...) => (VPSHRDD256 ...)
-(ShiftAllRightConcatUint32x16 ...) => (VPSHRDD512 ...)
-(ShiftAllRightConcatUint64x2 ...) => (VPSHRDQ128 ...)
-(ShiftAllRightConcatUint64x4 ...) => (VPSHRDQ256 ...)
-(ShiftAllRightConcatUint64x8 ...) => (VPSHRDQ512 ...)
+(ShiftAllRightConcatMod16Int16x8 ...) => (VPSHRDW128 ...)
+(ShiftAllRightConcatMod16Int16x16 ...) => (VPSHRDW256 ...)
+(ShiftAllRightConcatMod16Int16x32 ...) => (VPSHRDW512 ...)
+(ShiftAllRightConcatMod16Uint16x8 ...) => (VPSHRDW128 ...)
+(ShiftAllRightConcatMod16Uint16x16 ...) => (VPSHRDW256 ...)
+(ShiftAllRightConcatMod16Uint16x32 ...) => (VPSHRDW512 ...)
+(ShiftAllRightConcatMod32Int32x4 ...) => (VPSHRDD128 ...)
+(ShiftAllRightConcatMod32Int32x8 ...) => (VPSHRDD256 ...)
+(ShiftAllRightConcatMod32Int32x16 ...) => (VPSHRDD512 ...)
+(ShiftAllRightConcatMod32Uint32x4 ...) => (VPSHRDD128 ...)
+(ShiftAllRightConcatMod32Uint32x8 ...) => (VPSHRDD256 ...)
+(ShiftAllRightConcatMod32Uint32x16 ...) => (VPSHRDD512 ...)
+(ShiftAllRightConcatMod64Int64x2 ...) => (VPSHRDQ128 ...)
+(ShiftAllRightConcatMod64Int64x4 ...) => (VPSHRDQ256 ...)
+(ShiftAllRightConcatMod64Int64x8 ...) => (VPSHRDQ512 ...)
+(ShiftAllRightConcatMod64Uint64x2 ...) => (VPSHRDQ128 ...)
+(ShiftAllRightConcatMod64Uint64x4 ...) => (VPSHRDQ256 ...)
+(ShiftAllRightConcatMod64Uint64x8 ...) => (VPSHRDQ512 ...)
(VPSRAWMasked128 x (MOVQconst [c]) mask) => (VPSRAWMasked128const [amd64CapAVXShift(c)] x mask)
(VPSRAWMasked256 x (MOVQconst [c]) mask) => (VPSRAWMasked256const [amd64CapAVXShift(c)] x mask)
(VPSRAWMasked512 x (MOVQconst [c]) mask) => (VPSRAWMasked512const [amd64CapAVXShift(c)] x mask)
@@ -1108,24 +1108,24 @@
(ShiftLeftUint64x2 ...) => (VPSLLVQ128 ...)
(ShiftLeftUint64x4 ...) => (VPSLLVQ256 ...)
(ShiftLeftUint64x8 ...) => (VPSLLVQ512 ...)
-(ShiftLeftConcatInt16x8 ...) => (VPSHLDVW128 ...)
-(ShiftLeftConcatInt16x16 ...) => (VPSHLDVW256 ...)
-(ShiftLeftConcatInt16x32 ...) => (VPSHLDVW512 ...)
-(ShiftLeftConcatInt32x4 ...) => (VPSHLDVD128 ...)
-(ShiftLeftConcatInt32x8 ...) => (VPSHLDVD256 ...)
-(ShiftLeftConcatInt32x16 ...) => (VPSHLDVD512 ...)
-(ShiftLeftConcatInt64x2 ...) => (VPSHLDVQ128 ...)
-(ShiftLeftConcatInt64x4 ...) => (VPSHLDVQ256 ...)
-(ShiftLeftConcatInt64x8 ...) => (VPSHLDVQ512 ...)
-(ShiftLeftConcatUint16x8 ...) => (VPSHLDVW128 ...)
-(ShiftLeftConcatUint16x16 ...) => (VPSHLDVW256 ...)
-(ShiftLeftConcatUint16x32 ...) => (VPSHLDVW512 ...)
-(ShiftLeftConcatUint32x4 ...) => (VPSHLDVD128 ...)
-(ShiftLeftConcatUint32x8 ...) => (VPSHLDVD256 ...)
-(ShiftLeftConcatUint32x16 ...) => (VPSHLDVD512 ...)
-(ShiftLeftConcatUint64x2 ...) => (VPSHLDVQ128 ...)
-(ShiftLeftConcatUint64x4 ...) => (VPSHLDVQ256 ...)
-(ShiftLeftConcatUint64x8 ...) => (VPSHLDVQ512 ...)
+(ShiftLeftConcatMod16Int16x8 ...) => (VPSHLDVW128 ...)
+(ShiftLeftConcatMod16Int16x16 ...) => (VPSHLDVW256 ...)
+(ShiftLeftConcatMod16Int16x32 ...) => (VPSHLDVW512 ...)
+(ShiftLeftConcatMod16Uint16x8 ...) => (VPSHLDVW128 ...)
+(ShiftLeftConcatMod16Uint16x16 ...) => (VPSHLDVW256 ...)
+(ShiftLeftConcatMod16Uint16x32 ...) => (VPSHLDVW512 ...)
+(ShiftLeftConcatMod32Int32x4 ...) => (VPSHLDVD128 ...)
+(ShiftLeftConcatMod32Int32x8 ...) => (VPSHLDVD256 ...)
+(ShiftLeftConcatMod32Int32x16 ...) => (VPSHLDVD512 ...)
+(ShiftLeftConcatMod32Uint32x4 ...) => (VPSHLDVD128 ...)
+(ShiftLeftConcatMod32Uint32x8 ...) => (VPSHLDVD256 ...)
+(ShiftLeftConcatMod32Uint32x16 ...) => (VPSHLDVD512 ...)
+(ShiftLeftConcatMod64Int64x2 ...) => (VPSHLDVQ128 ...)
+(ShiftLeftConcatMod64Int64x4 ...) => (VPSHLDVQ256 ...)
+(ShiftLeftConcatMod64Int64x8 ...) => (VPSHLDVQ512 ...)
+(ShiftLeftConcatMod64Uint64x2 ...) => (VPSHLDVQ128 ...)
+(ShiftLeftConcatMod64Uint64x4 ...) => (VPSHLDVQ256 ...)
+(ShiftLeftConcatMod64Uint64x8 ...) => (VPSHLDVQ512 ...)
(ShiftRightInt16x8 ...) => (VPSRAVW128 ...)
(ShiftRightInt16x16 ...) => (VPSRAVW256 ...)
(ShiftRightInt16x32 ...) => (VPSRAVW512 ...)
@@ -1144,24 +1144,24 @@
(ShiftRightUint64x2 ...) => (VPSRLVQ128 ...)
(ShiftRightUint64x4 ...) => (VPSRLVQ256 ...)
(ShiftRightUint64x8 ...) => (VPSRLVQ512 ...)
-(ShiftRightConcatInt16x8 ...) => (VPSHRDVW128 ...)
-(ShiftRightConcatInt16x16 ...) => (VPSHRDVW256 ...)
-(ShiftRightConcatInt16x32 ...) => (VPSHRDVW512 ...)
-(ShiftRightConcatInt32x4 ...) => (VPSHRDVD128 ...)
-(ShiftRightConcatInt32x8 ...) => (VPSHRDVD256 ...)
-(ShiftRightConcatInt32x16 ...) => (VPSHRDVD512 ...)
-(ShiftRightConcatInt64x2 ...) => (VPSHRDVQ128 ...)
-(ShiftRightConcatInt64x4 ...) => (VPSHRDVQ256 ...)
-(ShiftRightConcatInt64x8 ...) => (VPSHRDVQ512 ...)
-(ShiftRightConcatUint16x8 ...) => (VPSHRDVW128 ...)
-(ShiftRightConcatUint16x16 ...) => (VPSHRDVW256 ...)
-(ShiftRightConcatUint16x32 ...) => (VPSHRDVW512 ...)
-(ShiftRightConcatUint32x4 ...) => (VPSHRDVD128 ...)
-(ShiftRightConcatUint32x8 ...) => (VPSHRDVD256 ...)
-(ShiftRightConcatUint32x16 ...) => (VPSHRDVD512 ...)
-(ShiftRightConcatUint64x2 ...) => (VPSHRDVQ128 ...)
-(ShiftRightConcatUint64x4 ...) => (VPSHRDVQ256 ...)
-(ShiftRightConcatUint64x8 ...) => (VPSHRDVQ512 ...)
+(ShiftRightConcatMod16Int16x8 ...) => (VPSHRDVW128 ...)
+(ShiftRightConcatMod16Int16x16 ...) => (VPSHRDVW256 ...)
+(ShiftRightConcatMod16Int16x32 ...) => (VPSHRDVW512 ...)
+(ShiftRightConcatMod16Uint16x8 ...) => (VPSHRDVW128 ...)
+(ShiftRightConcatMod16Uint16x16 ...) => (VPSHRDVW256 ...)
+(ShiftRightConcatMod16Uint16x32 ...) => (VPSHRDVW512 ...)
+(ShiftRightConcatMod32Int32x4 ...) => (VPSHRDVD128 ...)
+(ShiftRightConcatMod32Int32x8 ...) => (VPSHRDVD256 ...)
+(ShiftRightConcatMod32Int32x16 ...) => (VPSHRDVD512 ...)
+(ShiftRightConcatMod32Uint32x4 ...) => (VPSHRDVD128 ...)
+(ShiftRightConcatMod32Uint32x8 ...) => (VPSHRDVD256 ...)
+(ShiftRightConcatMod32Uint32x16 ...) => (VPSHRDVD512 ...)
+(ShiftRightConcatMod64Int64x2 ...) => (VPSHRDVQ128 ...)
+(ShiftRightConcatMod64Int64x4 ...) => (VPSHRDVQ256 ...)
+(ShiftRightConcatMod64Int64x8 ...) => (VPSHRDVQ512 ...)
+(ShiftRightConcatMod64Uint64x2 ...) => (VPSHRDVQ128 ...)
+(ShiftRightConcatMod64Uint64x4 ...) => (VPSHRDVQ256 ...)
+(ShiftRightConcatMod64Uint64x8 ...) => (VPSHRDVQ512 ...)
(SqrtFloat32x4 ...) => (VSQRTPS128 ...)
(SqrtFloat32x8 ...) => (VSQRTPS256 ...)
(SqrtFloat32x16 ...) => (VSQRTPS512 ...)
@@ -3014,24 +3014,24 @@
(VPSHLDD128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
(VPSHLDD256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
(VPSHLDD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
-(VPSHLDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
-(VPSHLDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
-(VPSHLDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
(VPSHLDDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
(VPSHLDDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
(VPSHLDDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPSHLDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
+(VPSHLDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
+(VPSHLDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
(VPSHLDQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
(VPSHLDQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
(VPSHLDQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
(VPSHRDD128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
(VPSHRDD256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
(VPSHRDD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
-(VPSHRDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
-(VPSHRDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
-(VPSHRDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
(VPSHRDDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
(VPSHRDDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
(VPSHRDDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPSHRDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
+(VPSHRDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
+(VPSHRDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
(VPSHRDQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
(VPSHRDQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
(VPSHRDQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
@@ -3040,12 +3040,12 @@
(VPSHLDVD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVD128load {sym} [off] x y ptr mem)
(VPSHLDVD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVD256load {sym} [off] x y ptr mem)
(VPSHLDVD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVD512load {sym} [off] x y ptr mem)
-(VPSHLDVQ128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVQ128load {sym} [off] x y ptr mem)
-(VPSHLDVQ256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVQ256load {sym} [off] x y ptr mem)
-(VPSHLDVQ512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVQ512load {sym} [off] x y ptr mem)
(VPSHLDVDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVDMasked128load {sym} [off] x y ptr mask mem)
(VPSHLDVDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVDMasked256load {sym} [off] x y ptr mask mem)
(VPSHLDVDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVDMasked512load {sym} [off] x y ptr mask mem)
+(VPSHLDVQ128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVQ128load {sym} [off] x y ptr mem)
+(VPSHLDVQ256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVQ256load {sym} [off] x y ptr mem)
+(VPSHLDVQ512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVQ512load {sym} [off] x y ptr mem)
(VPSHLDVQMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVQMasked128load {sym} [off] x y ptr mask mem)
(VPSHLDVQMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVQMasked256load {sym} [off] x y ptr mask mem)
(VPSHLDVQMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVQMasked512load {sym} [off] x y ptr mask mem)
@@ -3064,12 +3064,12 @@
(VPSHRDVD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVD128load {sym} [off] x y ptr mem)
(VPSHRDVD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVD256load {sym} [off] x y ptr mem)
(VPSHRDVD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVD512load {sym} [off] x y ptr mem)
-(VPSHRDVQ128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVQ128load {sym} [off] x y ptr mem)
-(VPSHRDVQ256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVQ256load {sym} [off] x y ptr mem)
-(VPSHRDVQ512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVQ512load {sym} [off] x y ptr mem)
(VPSHRDVDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVDMasked128load {sym} [off] x y ptr mask mem)
(VPSHRDVDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVDMasked256load {sym} [off] x y ptr mask mem)
(VPSHRDVDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVDMasked512load {sym} [off] x y ptr mask mem)
+(VPSHRDVQ128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVQ128load {sym} [off] x y ptr mem)
+(VPSHRDVQ256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVQ256load {sym} [off] x y ptr mem)
+(VPSHRDVQ512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVQ512load {sym} [off] x y ptr mem)
(VPSHRDVQMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVQMasked128load {sym} [off] x y ptr mask mem)
(VPSHRDVQMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVQMasked256load {sym} [off] x y ptr mask mem)
(VPSHRDVQMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDVQMasked512load {sym} [off] x y ptr mask mem)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 44b9283..99aa9e2 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -903,24 +903,24 @@
{name: "ShiftAllRightUint64x2", argLength: 2}, // ARCH:amd64
{name: "ShiftAllRightUint64x4", argLength: 2}, // ARCH:amd64
{name: "ShiftAllRightUint64x8", argLength: 2}, // ARCH:amd64
- {name: "ShiftLeftConcatInt16x8", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatInt16x16", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatInt16x32", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatInt32x4", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatInt32x8", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatInt32x16", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatInt64x2", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatInt64x4", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatInt64x8", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatUint16x8", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatUint16x16", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatUint16x32", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatUint32x4", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatUint32x8", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatUint32x16", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatUint64x2", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatUint64x4", argLength: 3}, // ARCH:amd64
- {name: "ShiftLeftConcatUint64x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod16Int16x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod16Int16x16", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod16Int16x32", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod16Uint16x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod16Uint16x16", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod16Uint16x32", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod32Int32x4", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod32Int32x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod32Int32x16", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod32Uint32x4", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod32Uint32x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod32Uint32x16", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod64Int64x2", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod64Int64x4", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod64Int64x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod64Uint64x2", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod64Uint64x4", argLength: 3}, // ARCH:amd64
+ {name: "ShiftLeftConcatMod64Uint64x8", argLength: 3}, // ARCH:amd64
{name: "ShiftLeftInt16x8", argLength: 2}, // ARCH:amd64
{name: "ShiftLeftInt16x16", argLength: 2}, // ARCH:amd64
{name: "ShiftLeftInt16x32", argLength: 2}, // ARCH:amd64
@@ -939,24 +939,24 @@
{name: "ShiftLeftUint64x2", argLength: 2}, // ARCH:amd64
{name: "ShiftLeftUint64x4", argLength: 2}, // ARCH:amd64
{name: "ShiftLeftUint64x8", argLength: 2}, // ARCH:amd64
- {name: "ShiftRightConcatInt16x8", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatInt16x16", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatInt16x32", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatInt32x4", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatInt32x8", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatInt32x16", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatInt64x2", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatInt64x4", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatInt64x8", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatUint16x8", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatUint16x16", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatUint16x32", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatUint32x4", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatUint32x8", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatUint32x16", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatUint64x2", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatUint64x4", argLength: 3}, // ARCH:amd64
- {name: "ShiftRightConcatUint64x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod16Int16x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod16Int16x16", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod16Int16x32", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod16Uint16x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod16Uint16x16", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod16Uint16x32", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod32Int32x4", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod32Int32x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod32Int32x16", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod32Uint32x4", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod32Uint32x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod32Uint32x16", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod64Int64x2", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod64Int64x4", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod64Int64x8", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod64Uint64x2", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod64Uint64x4", argLength: 3}, // ARCH:amd64
+ {name: "ShiftRightConcatMod64Uint64x8", argLength: 3}, // ARCH:amd64
{name: "ShiftRightInt16x8", argLength: 2}, // ARCH:amd64
{name: "ShiftRightInt16x16", argLength: 2}, // ARCH:amd64
{name: "ShiftRightInt16x32", argLength: 2}, // ARCH:amd64
@@ -1271,42 +1271,42 @@
{name: "SetElemUint16x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
{name: "SetElemUint32x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
{name: "SetElemUint64x2", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatInt16x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatInt16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatInt16x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatInt32x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatInt32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatInt32x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatInt64x2", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatInt64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatInt64x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatUint16x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatUint16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatUint16x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatUint32x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatUint32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatUint32x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatUint64x2", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatUint64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllLeftConcatUint64x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatInt16x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatInt16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatInt16x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatInt32x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatInt32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatInt32x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatInt64x2", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatInt64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatInt64x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatUint16x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatUint16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatUint16x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatUint32x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatUint32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatUint32x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatUint64x2", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatUint64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "ShiftAllRightConcatUint64x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod16Int16x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod16Int16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod16Int16x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod16Uint16x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod16Uint16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod16Uint16x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod32Int32x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod32Int32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod32Int32x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod32Uint32x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod32Uint32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod32Uint32x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod64Int64x2", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod64Int64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod64Int64x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod64Uint64x2", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod64Uint64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllLeftConcatMod64Uint64x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod16Int16x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod16Int16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod16Int16x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod16Uint16x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod16Uint16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod16Uint16x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod32Int32x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod32Int32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod32Int32x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod32Uint32x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod32Uint32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod32Uint32x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod64Int64x2", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod64Int64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod64Int64x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod64Uint64x2", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod64Uint64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ShiftAllRightConcatMod64Uint64x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
{name: "TruncScaledFloat32x4", argLength: 1, aux: "UInt8"}, // ARCH:amd64
{name: "TruncScaledFloat32x8", argLength: 1, aux: "UInt8"}, // ARCH:amd64
{name: "TruncScaledFloat32x16", argLength: 1, aux: "UInt8"}, // ARCH:amd64
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index f3d6832..dd65c70 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -7084,24 +7084,24 @@
OpShiftAllRightUint64x2
OpShiftAllRightUint64x4
OpShiftAllRightUint64x8
- OpShiftLeftConcatInt16x8
- OpShiftLeftConcatInt16x16
- OpShiftLeftConcatInt16x32
- OpShiftLeftConcatInt32x4
- OpShiftLeftConcatInt32x8
- OpShiftLeftConcatInt32x16
- OpShiftLeftConcatInt64x2
- OpShiftLeftConcatInt64x4
- OpShiftLeftConcatInt64x8
- OpShiftLeftConcatUint16x8
- OpShiftLeftConcatUint16x16
- OpShiftLeftConcatUint16x32
- OpShiftLeftConcatUint32x4
- OpShiftLeftConcatUint32x8
- OpShiftLeftConcatUint32x16
- OpShiftLeftConcatUint64x2
- OpShiftLeftConcatUint64x4
- OpShiftLeftConcatUint64x8
+ OpShiftLeftConcatMod16Int16x8
+ OpShiftLeftConcatMod16Int16x16
+ OpShiftLeftConcatMod16Int16x32
+ OpShiftLeftConcatMod16Uint16x8
+ OpShiftLeftConcatMod16Uint16x16
+ OpShiftLeftConcatMod16Uint16x32
+ OpShiftLeftConcatMod32Int32x4
+ OpShiftLeftConcatMod32Int32x8
+ OpShiftLeftConcatMod32Int32x16
+ OpShiftLeftConcatMod32Uint32x4
+ OpShiftLeftConcatMod32Uint32x8
+ OpShiftLeftConcatMod32Uint32x16
+ OpShiftLeftConcatMod64Int64x2
+ OpShiftLeftConcatMod64Int64x4
+ OpShiftLeftConcatMod64Int64x8
+ OpShiftLeftConcatMod64Uint64x2
+ OpShiftLeftConcatMod64Uint64x4
+ OpShiftLeftConcatMod64Uint64x8
OpShiftLeftInt16x8
OpShiftLeftInt16x16
OpShiftLeftInt16x32
@@ -7120,24 +7120,24 @@
OpShiftLeftUint64x2
OpShiftLeftUint64x4
OpShiftLeftUint64x8
- OpShiftRightConcatInt16x8
- OpShiftRightConcatInt16x16
- OpShiftRightConcatInt16x32
- OpShiftRightConcatInt32x4
- OpShiftRightConcatInt32x8
- OpShiftRightConcatInt32x16
- OpShiftRightConcatInt64x2
- OpShiftRightConcatInt64x4
- OpShiftRightConcatInt64x8
- OpShiftRightConcatUint16x8
- OpShiftRightConcatUint16x16
- OpShiftRightConcatUint16x32
- OpShiftRightConcatUint32x4
- OpShiftRightConcatUint32x8
- OpShiftRightConcatUint32x16
- OpShiftRightConcatUint64x2
- OpShiftRightConcatUint64x4
- OpShiftRightConcatUint64x8
+ OpShiftRightConcatMod16Int16x8
+ OpShiftRightConcatMod16Int16x16
+ OpShiftRightConcatMod16Int16x32
+ OpShiftRightConcatMod16Uint16x8
+ OpShiftRightConcatMod16Uint16x16
+ OpShiftRightConcatMod16Uint16x32
+ OpShiftRightConcatMod32Int32x4
+ OpShiftRightConcatMod32Int32x8
+ OpShiftRightConcatMod32Int32x16
+ OpShiftRightConcatMod32Uint32x4
+ OpShiftRightConcatMod32Uint32x8
+ OpShiftRightConcatMod32Uint32x16
+ OpShiftRightConcatMod64Int64x2
+ OpShiftRightConcatMod64Int64x4
+ OpShiftRightConcatMod64Int64x8
+ OpShiftRightConcatMod64Uint64x2
+ OpShiftRightConcatMod64Uint64x4
+ OpShiftRightConcatMod64Uint64x8
OpShiftRightInt16x8
OpShiftRightInt16x16
OpShiftRightInt16x32
@@ -7452,42 +7452,42 @@
OpSetElemUint16x8
OpSetElemUint32x4
OpSetElemUint64x2
- OpShiftAllLeftConcatInt16x8
- OpShiftAllLeftConcatInt16x16
- OpShiftAllLeftConcatInt16x32
- OpShiftAllLeftConcatInt32x4
- OpShiftAllLeftConcatInt32x8
- OpShiftAllLeftConcatInt32x16
- OpShiftAllLeftConcatInt64x2
- OpShiftAllLeftConcatInt64x4
- OpShiftAllLeftConcatInt64x8
- OpShiftAllLeftConcatUint16x8
- OpShiftAllLeftConcatUint16x16
- OpShiftAllLeftConcatUint16x32
- OpShiftAllLeftConcatUint32x4
- OpShiftAllLeftConcatUint32x8
- OpShiftAllLeftConcatUint32x16
- OpShiftAllLeftConcatUint64x2
- OpShiftAllLeftConcatUint64x4
- OpShiftAllLeftConcatUint64x8
- OpShiftAllRightConcatInt16x8
- OpShiftAllRightConcatInt16x16
- OpShiftAllRightConcatInt16x32
- OpShiftAllRightConcatInt32x4
- OpShiftAllRightConcatInt32x8
- OpShiftAllRightConcatInt32x16
- OpShiftAllRightConcatInt64x2
- OpShiftAllRightConcatInt64x4
- OpShiftAllRightConcatInt64x8
- OpShiftAllRightConcatUint16x8
- OpShiftAllRightConcatUint16x16
- OpShiftAllRightConcatUint16x32
- OpShiftAllRightConcatUint32x4
- OpShiftAllRightConcatUint32x8
- OpShiftAllRightConcatUint32x16
- OpShiftAllRightConcatUint64x2
- OpShiftAllRightConcatUint64x4
- OpShiftAllRightConcatUint64x8
+ OpShiftAllLeftConcatMod16Int16x8
+ OpShiftAllLeftConcatMod16Int16x16
+ OpShiftAllLeftConcatMod16Int16x32
+ OpShiftAllLeftConcatMod16Uint16x8
+ OpShiftAllLeftConcatMod16Uint16x16
+ OpShiftAllLeftConcatMod16Uint16x32
+ OpShiftAllLeftConcatMod32Int32x4
+ OpShiftAllLeftConcatMod32Int32x8
+ OpShiftAllLeftConcatMod32Int32x16
+ OpShiftAllLeftConcatMod32Uint32x4
+ OpShiftAllLeftConcatMod32Uint32x8
+ OpShiftAllLeftConcatMod32Uint32x16
+ OpShiftAllLeftConcatMod64Int64x2
+ OpShiftAllLeftConcatMod64Int64x4
+ OpShiftAllLeftConcatMod64Int64x8
+ OpShiftAllLeftConcatMod64Uint64x2
+ OpShiftAllLeftConcatMod64Uint64x4
+ OpShiftAllLeftConcatMod64Uint64x8
+ OpShiftAllRightConcatMod16Int16x8
+ OpShiftAllRightConcatMod16Int16x16
+ OpShiftAllRightConcatMod16Int16x32
+ OpShiftAllRightConcatMod16Uint16x8
+ OpShiftAllRightConcatMod16Uint16x16
+ OpShiftAllRightConcatMod16Uint16x32
+ OpShiftAllRightConcatMod32Int32x4
+ OpShiftAllRightConcatMod32Int32x8
+ OpShiftAllRightConcatMod32Int32x16
+ OpShiftAllRightConcatMod32Uint32x4
+ OpShiftAllRightConcatMod32Uint32x8
+ OpShiftAllRightConcatMod32Uint32x16
+ OpShiftAllRightConcatMod64Int64x2
+ OpShiftAllRightConcatMod64Int64x4
+ OpShiftAllRightConcatMod64Int64x8
+ OpShiftAllRightConcatMod64Uint64x2
+ OpShiftAllRightConcatMod64Uint64x4
+ OpShiftAllRightConcatMod64Uint64x8
OpTruncScaledFloat32x4
OpTruncScaledFloat32x8
OpTruncScaledFloat32x16
@@ -93994,92 +93994,92 @@
generic: true,
},
{
- name: "ShiftLeftConcatInt16x8",
+ name: "ShiftLeftConcatMod16Int16x8",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatInt16x16",
+ name: "ShiftLeftConcatMod16Int16x16",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatInt16x32",
+ name: "ShiftLeftConcatMod16Int16x32",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatInt32x4",
+ name: "ShiftLeftConcatMod16Uint16x8",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatInt32x8",
+ name: "ShiftLeftConcatMod16Uint16x16",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatInt32x16",
+ name: "ShiftLeftConcatMod16Uint16x32",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatInt64x2",
+ name: "ShiftLeftConcatMod32Int32x4",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatInt64x4",
+ name: "ShiftLeftConcatMod32Int32x8",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatInt64x8",
+ name: "ShiftLeftConcatMod32Int32x16",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatUint16x8",
+ name: "ShiftLeftConcatMod32Uint32x4",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatUint16x16",
+ name: "ShiftLeftConcatMod32Uint32x8",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatUint16x32",
+ name: "ShiftLeftConcatMod32Uint32x16",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatUint32x4",
+ name: "ShiftLeftConcatMod64Int64x2",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatUint32x8",
+ name: "ShiftLeftConcatMod64Int64x4",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatUint32x16",
+ name: "ShiftLeftConcatMod64Int64x8",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatUint64x2",
+ name: "ShiftLeftConcatMod64Uint64x2",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatUint64x4",
+ name: "ShiftLeftConcatMod64Uint64x4",
argLen: 3,
generic: true,
},
{
- name: "ShiftLeftConcatUint64x8",
+ name: "ShiftLeftConcatMod64Uint64x8",
argLen: 3,
generic: true,
},
@@ -94174,92 +94174,92 @@
generic: true,
},
{
- name: "ShiftRightConcatInt16x8",
+ name: "ShiftRightConcatMod16Int16x8",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatInt16x16",
+ name: "ShiftRightConcatMod16Int16x16",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatInt16x32",
+ name: "ShiftRightConcatMod16Int16x32",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatInt32x4",
+ name: "ShiftRightConcatMod16Uint16x8",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatInt32x8",
+ name: "ShiftRightConcatMod16Uint16x16",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatInt32x16",
+ name: "ShiftRightConcatMod16Uint16x32",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatInt64x2",
+ name: "ShiftRightConcatMod32Int32x4",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatInt64x4",
+ name: "ShiftRightConcatMod32Int32x8",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatInt64x8",
+ name: "ShiftRightConcatMod32Int32x16",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatUint16x8",
+ name: "ShiftRightConcatMod32Uint32x4",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatUint16x16",
+ name: "ShiftRightConcatMod32Uint32x8",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatUint16x32",
+ name: "ShiftRightConcatMod32Uint32x16",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatUint32x4",
+ name: "ShiftRightConcatMod64Int64x2",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatUint32x8",
+ name: "ShiftRightConcatMod64Int64x4",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatUint32x16",
+ name: "ShiftRightConcatMod64Int64x8",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatUint64x2",
+ name: "ShiftRightConcatMod64Uint64x2",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatUint64x4",
+ name: "ShiftRightConcatMod64Uint64x4",
argLen: 3,
generic: true,
},
{
- name: "ShiftRightConcatUint64x8",
+ name: "ShiftRightConcatMod64Uint64x8",
argLen: 3,
generic: true,
},
@@ -95959,217 +95959,217 @@
generic: true,
},
{
- name: "ShiftAllLeftConcatInt16x8",
+ name: "ShiftAllLeftConcatMod16Int16x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatInt16x16",
+ name: "ShiftAllLeftConcatMod16Int16x16",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatInt16x32",
+ name: "ShiftAllLeftConcatMod16Int16x32",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatInt32x4",
+ name: "ShiftAllLeftConcatMod16Uint16x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatInt32x8",
+ name: "ShiftAllLeftConcatMod16Uint16x16",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatInt32x16",
+ name: "ShiftAllLeftConcatMod16Uint16x32",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatInt64x2",
+ name: "ShiftAllLeftConcatMod32Int32x4",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatInt64x4",
+ name: "ShiftAllLeftConcatMod32Int32x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatInt64x8",
+ name: "ShiftAllLeftConcatMod32Int32x16",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatUint16x8",
+ name: "ShiftAllLeftConcatMod32Uint32x4",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatUint16x16",
+ name: "ShiftAllLeftConcatMod32Uint32x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatUint16x32",
+ name: "ShiftAllLeftConcatMod32Uint32x16",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatUint32x4",
+ name: "ShiftAllLeftConcatMod64Int64x2",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatUint32x8",
+ name: "ShiftAllLeftConcatMod64Int64x4",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatUint32x16",
+ name: "ShiftAllLeftConcatMod64Int64x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatUint64x2",
+ name: "ShiftAllLeftConcatMod64Uint64x2",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatUint64x4",
+ name: "ShiftAllLeftConcatMod64Uint64x4",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllLeftConcatUint64x8",
+ name: "ShiftAllLeftConcatMod64Uint64x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatInt16x8",
+ name: "ShiftAllRightConcatMod16Int16x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatInt16x16",
+ name: "ShiftAllRightConcatMod16Int16x16",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatInt16x32",
+ name: "ShiftAllRightConcatMod16Int16x32",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatInt32x4",
+ name: "ShiftAllRightConcatMod16Uint16x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatInt32x8",
+ name: "ShiftAllRightConcatMod16Uint16x16",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatInt32x16",
+ name: "ShiftAllRightConcatMod16Uint16x32",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatInt64x2",
+ name: "ShiftAllRightConcatMod32Int32x4",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatInt64x4",
+ name: "ShiftAllRightConcatMod32Int32x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatInt64x8",
+ name: "ShiftAllRightConcatMod32Int32x16",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatUint16x8",
+ name: "ShiftAllRightConcatMod32Uint32x4",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatUint16x16",
+ name: "ShiftAllRightConcatMod32Uint32x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatUint16x32",
+ name: "ShiftAllRightConcatMod32Uint32x16",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatUint32x4",
+ name: "ShiftAllRightConcatMod64Int64x2",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatUint32x8",
+ name: "ShiftAllRightConcatMod64Int64x4",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatUint32x16",
+ name: "ShiftAllRightConcatMod64Int64x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatUint64x2",
+ name: "ShiftAllRightConcatMod64Uint64x2",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatUint64x4",
+ name: "ShiftAllRightConcatMod64Uint64x4",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
- name: "ShiftAllRightConcatUint64x8",
+ name: "ShiftAllRightConcatMod64Uint64x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 648b7b8..d034ba8 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -5290,58 +5290,58 @@
return rewriteValueAMD64_OpSetLoUint8x32(v)
case OpSetLoUint8x64:
return rewriteValueAMD64_OpSetLoUint8x64(v)
- case OpShiftAllLeftConcatInt16x16:
+ case OpShiftAllLeftConcatMod16Int16x16:
v.Op = OpAMD64VPSHLDW256
return true
- case OpShiftAllLeftConcatInt16x32:
+ case OpShiftAllLeftConcatMod16Int16x32:
v.Op = OpAMD64VPSHLDW512
return true
- case OpShiftAllLeftConcatInt16x8:
+ case OpShiftAllLeftConcatMod16Int16x8:
v.Op = OpAMD64VPSHLDW128
return true
- case OpShiftAllLeftConcatInt32x16:
+ case OpShiftAllLeftConcatMod16Uint16x16:
+ v.Op = OpAMD64VPSHLDW256
+ return true
+ case OpShiftAllLeftConcatMod16Uint16x32:
+ v.Op = OpAMD64VPSHLDW512
+ return true
+ case OpShiftAllLeftConcatMod16Uint16x8:
+ v.Op = OpAMD64VPSHLDW128
+ return true
+ case OpShiftAllLeftConcatMod32Int32x16:
v.Op = OpAMD64VPSHLDD512
return true
- case OpShiftAllLeftConcatInt32x4:
+ case OpShiftAllLeftConcatMod32Int32x4:
v.Op = OpAMD64VPSHLDD128
return true
- case OpShiftAllLeftConcatInt32x8:
+ case OpShiftAllLeftConcatMod32Int32x8:
v.Op = OpAMD64VPSHLDD256
return true
- case OpShiftAllLeftConcatInt64x2:
+ case OpShiftAllLeftConcatMod32Uint32x16:
+ v.Op = OpAMD64VPSHLDD512
+ return true
+ case OpShiftAllLeftConcatMod32Uint32x4:
+ v.Op = OpAMD64VPSHLDD128
+ return true
+ case OpShiftAllLeftConcatMod32Uint32x8:
+ v.Op = OpAMD64VPSHLDD256
+ return true
+ case OpShiftAllLeftConcatMod64Int64x2:
v.Op = OpAMD64VPSHLDQ128
return true
- case OpShiftAllLeftConcatInt64x4:
+ case OpShiftAllLeftConcatMod64Int64x4:
v.Op = OpAMD64VPSHLDQ256
return true
- case OpShiftAllLeftConcatInt64x8:
+ case OpShiftAllLeftConcatMod64Int64x8:
v.Op = OpAMD64VPSHLDQ512
return true
- case OpShiftAllLeftConcatUint16x16:
- v.Op = OpAMD64VPSHLDW256
- return true
- case OpShiftAllLeftConcatUint16x32:
- v.Op = OpAMD64VPSHLDW512
- return true
- case OpShiftAllLeftConcatUint16x8:
- v.Op = OpAMD64VPSHLDW128
- return true
- case OpShiftAllLeftConcatUint32x16:
- v.Op = OpAMD64VPSHLDD512
- return true
- case OpShiftAllLeftConcatUint32x4:
- v.Op = OpAMD64VPSHLDD128
- return true
- case OpShiftAllLeftConcatUint32x8:
- v.Op = OpAMD64VPSHLDD256
- return true
- case OpShiftAllLeftConcatUint64x2:
+ case OpShiftAllLeftConcatMod64Uint64x2:
v.Op = OpAMD64VPSHLDQ128
return true
- case OpShiftAllLeftConcatUint64x4:
+ case OpShiftAllLeftConcatMod64Uint64x4:
v.Op = OpAMD64VPSHLDQ256
return true
- case OpShiftAllLeftConcatUint64x8:
+ case OpShiftAllLeftConcatMod64Uint64x8:
v.Op = OpAMD64VPSHLDQ512
return true
case OpShiftAllLeftInt16x16:
@@ -5398,58 +5398,58 @@
case OpShiftAllLeftUint64x8:
v.Op = OpAMD64VPSLLQ512
return true
- case OpShiftAllRightConcatInt16x16:
+ case OpShiftAllRightConcatMod16Int16x16:
v.Op = OpAMD64VPSHRDW256
return true
- case OpShiftAllRightConcatInt16x32:
+ case OpShiftAllRightConcatMod16Int16x32:
v.Op = OpAMD64VPSHRDW512
return true
- case OpShiftAllRightConcatInt16x8:
+ case OpShiftAllRightConcatMod16Int16x8:
v.Op = OpAMD64VPSHRDW128
return true
- case OpShiftAllRightConcatInt32x16:
+ case OpShiftAllRightConcatMod16Uint16x16:
+ v.Op = OpAMD64VPSHRDW256
+ return true
+ case OpShiftAllRightConcatMod16Uint16x32:
+ v.Op = OpAMD64VPSHRDW512
+ return true
+ case OpShiftAllRightConcatMod16Uint16x8:
+ v.Op = OpAMD64VPSHRDW128
+ return true
+ case OpShiftAllRightConcatMod32Int32x16:
v.Op = OpAMD64VPSHRDD512
return true
- case OpShiftAllRightConcatInt32x4:
+ case OpShiftAllRightConcatMod32Int32x4:
v.Op = OpAMD64VPSHRDD128
return true
- case OpShiftAllRightConcatInt32x8:
+ case OpShiftAllRightConcatMod32Int32x8:
v.Op = OpAMD64VPSHRDD256
return true
- case OpShiftAllRightConcatInt64x2:
+ case OpShiftAllRightConcatMod32Uint32x16:
+ v.Op = OpAMD64VPSHRDD512
+ return true
+ case OpShiftAllRightConcatMod32Uint32x4:
+ v.Op = OpAMD64VPSHRDD128
+ return true
+ case OpShiftAllRightConcatMod32Uint32x8:
+ v.Op = OpAMD64VPSHRDD256
+ return true
+ case OpShiftAllRightConcatMod64Int64x2:
v.Op = OpAMD64VPSHRDQ128
return true
- case OpShiftAllRightConcatInt64x4:
+ case OpShiftAllRightConcatMod64Int64x4:
v.Op = OpAMD64VPSHRDQ256
return true
- case OpShiftAllRightConcatInt64x8:
+ case OpShiftAllRightConcatMod64Int64x8:
v.Op = OpAMD64VPSHRDQ512
return true
- case OpShiftAllRightConcatUint16x16:
- v.Op = OpAMD64VPSHRDW256
- return true
- case OpShiftAllRightConcatUint16x32:
- v.Op = OpAMD64VPSHRDW512
- return true
- case OpShiftAllRightConcatUint16x8:
- v.Op = OpAMD64VPSHRDW128
- return true
- case OpShiftAllRightConcatUint32x16:
- v.Op = OpAMD64VPSHRDD512
- return true
- case OpShiftAllRightConcatUint32x4:
- v.Op = OpAMD64VPSHRDD128
- return true
- case OpShiftAllRightConcatUint32x8:
- v.Op = OpAMD64VPSHRDD256
- return true
- case OpShiftAllRightConcatUint64x2:
+ case OpShiftAllRightConcatMod64Uint64x2:
v.Op = OpAMD64VPSHRDQ128
return true
- case OpShiftAllRightConcatUint64x4:
+ case OpShiftAllRightConcatMod64Uint64x4:
v.Op = OpAMD64VPSHRDQ256
return true
- case OpShiftAllRightConcatUint64x8:
+ case OpShiftAllRightConcatMod64Uint64x8:
v.Op = OpAMD64VPSHRDQ512
return true
case OpShiftAllRightInt16x16:
@@ -5506,58 +5506,58 @@
case OpShiftAllRightUint64x8:
v.Op = OpAMD64VPSRLQ512
return true
- case OpShiftLeftConcatInt16x16:
+ case OpShiftLeftConcatMod16Int16x16:
v.Op = OpAMD64VPSHLDVW256
return true
- case OpShiftLeftConcatInt16x32:
+ case OpShiftLeftConcatMod16Int16x32:
v.Op = OpAMD64VPSHLDVW512
return true
- case OpShiftLeftConcatInt16x8:
+ case OpShiftLeftConcatMod16Int16x8:
v.Op = OpAMD64VPSHLDVW128
return true
- case OpShiftLeftConcatInt32x16:
+ case OpShiftLeftConcatMod16Uint16x16:
+ v.Op = OpAMD64VPSHLDVW256
+ return true
+ case OpShiftLeftConcatMod16Uint16x32:
+ v.Op = OpAMD64VPSHLDVW512
+ return true
+ case OpShiftLeftConcatMod16Uint16x8:
+ v.Op = OpAMD64VPSHLDVW128
+ return true
+ case OpShiftLeftConcatMod32Int32x16:
v.Op = OpAMD64VPSHLDVD512
return true
- case OpShiftLeftConcatInt32x4:
+ case OpShiftLeftConcatMod32Int32x4:
v.Op = OpAMD64VPSHLDVD128
return true
- case OpShiftLeftConcatInt32x8:
+ case OpShiftLeftConcatMod32Int32x8:
v.Op = OpAMD64VPSHLDVD256
return true
- case OpShiftLeftConcatInt64x2:
+ case OpShiftLeftConcatMod32Uint32x16:
+ v.Op = OpAMD64VPSHLDVD512
+ return true
+ case OpShiftLeftConcatMod32Uint32x4:
+ v.Op = OpAMD64VPSHLDVD128
+ return true
+ case OpShiftLeftConcatMod32Uint32x8:
+ v.Op = OpAMD64VPSHLDVD256
+ return true
+ case OpShiftLeftConcatMod64Int64x2:
v.Op = OpAMD64VPSHLDVQ128
return true
- case OpShiftLeftConcatInt64x4:
+ case OpShiftLeftConcatMod64Int64x4:
v.Op = OpAMD64VPSHLDVQ256
return true
- case OpShiftLeftConcatInt64x8:
+ case OpShiftLeftConcatMod64Int64x8:
v.Op = OpAMD64VPSHLDVQ512
return true
- case OpShiftLeftConcatUint16x16:
- v.Op = OpAMD64VPSHLDVW256
- return true
- case OpShiftLeftConcatUint16x32:
- v.Op = OpAMD64VPSHLDVW512
- return true
- case OpShiftLeftConcatUint16x8:
- v.Op = OpAMD64VPSHLDVW128
- return true
- case OpShiftLeftConcatUint32x16:
- v.Op = OpAMD64VPSHLDVD512
- return true
- case OpShiftLeftConcatUint32x4:
- v.Op = OpAMD64VPSHLDVD128
- return true
- case OpShiftLeftConcatUint32x8:
- v.Op = OpAMD64VPSHLDVD256
- return true
- case OpShiftLeftConcatUint64x2:
+ case OpShiftLeftConcatMod64Uint64x2:
v.Op = OpAMD64VPSHLDVQ128
return true
- case OpShiftLeftConcatUint64x4:
+ case OpShiftLeftConcatMod64Uint64x4:
v.Op = OpAMD64VPSHLDVQ256
return true
- case OpShiftLeftConcatUint64x8:
+ case OpShiftLeftConcatMod64Uint64x8:
v.Op = OpAMD64VPSHLDVQ512
return true
case OpShiftLeftInt16x16:
@@ -5614,58 +5614,58 @@
case OpShiftLeftUint64x8:
v.Op = OpAMD64VPSLLVQ512
return true
- case OpShiftRightConcatInt16x16:
+ case OpShiftRightConcatMod16Int16x16:
v.Op = OpAMD64VPSHRDVW256
return true
- case OpShiftRightConcatInt16x32:
+ case OpShiftRightConcatMod16Int16x32:
v.Op = OpAMD64VPSHRDVW512
return true
- case OpShiftRightConcatInt16x8:
+ case OpShiftRightConcatMod16Int16x8:
v.Op = OpAMD64VPSHRDVW128
return true
- case OpShiftRightConcatInt32x16:
+ case OpShiftRightConcatMod16Uint16x16:
+ v.Op = OpAMD64VPSHRDVW256
+ return true
+ case OpShiftRightConcatMod16Uint16x32:
+ v.Op = OpAMD64VPSHRDVW512
+ return true
+ case OpShiftRightConcatMod16Uint16x8:
+ v.Op = OpAMD64VPSHRDVW128
+ return true
+ case OpShiftRightConcatMod32Int32x16:
v.Op = OpAMD64VPSHRDVD512
return true
- case OpShiftRightConcatInt32x4:
+ case OpShiftRightConcatMod32Int32x4:
v.Op = OpAMD64VPSHRDVD128
return true
- case OpShiftRightConcatInt32x8:
+ case OpShiftRightConcatMod32Int32x8:
v.Op = OpAMD64VPSHRDVD256
return true
- case OpShiftRightConcatInt64x2:
+ case OpShiftRightConcatMod32Uint32x16:
+ v.Op = OpAMD64VPSHRDVD512
+ return true
+ case OpShiftRightConcatMod32Uint32x4:
+ v.Op = OpAMD64VPSHRDVD128
+ return true
+ case OpShiftRightConcatMod32Uint32x8:
+ v.Op = OpAMD64VPSHRDVD256
+ return true
+ case OpShiftRightConcatMod64Int64x2:
v.Op = OpAMD64VPSHRDVQ128
return true
- case OpShiftRightConcatInt64x4:
+ case OpShiftRightConcatMod64Int64x4:
v.Op = OpAMD64VPSHRDVQ256
return true
- case OpShiftRightConcatInt64x8:
+ case OpShiftRightConcatMod64Int64x8:
v.Op = OpAMD64VPSHRDVQ512
return true
- case OpShiftRightConcatUint16x16:
- v.Op = OpAMD64VPSHRDVW256
- return true
- case OpShiftRightConcatUint16x32:
- v.Op = OpAMD64VPSHRDVW512
- return true
- case OpShiftRightConcatUint16x8:
- v.Op = OpAMD64VPSHRDVW128
- return true
- case OpShiftRightConcatUint32x16:
- v.Op = OpAMD64VPSHRDVD512
- return true
- case OpShiftRightConcatUint32x4:
- v.Op = OpAMD64VPSHRDVD128
- return true
- case OpShiftRightConcatUint32x8:
- v.Op = OpAMD64VPSHRDVD256
- return true
- case OpShiftRightConcatUint64x2:
+ case OpShiftRightConcatMod64Uint64x2:
v.Op = OpAMD64VPSHRDVQ128
return true
- case OpShiftRightConcatUint64x4:
+ case OpShiftRightConcatMod64Uint64x4:
v.Op = OpAMD64VPSHRDVQ256
return true
- case OpShiftRightConcatUint64x8:
+ case OpShiftRightConcatMod64Uint64x8:
v.Op = OpAMD64VPSHRDVQ512
return true
case OpShiftRightInt16x16:
diff --git a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
index 326a7ee..cebed2e 100644
--- a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
@@ -992,24 +992,24 @@
addF(simdPackage, "Uint64x2.ShiftAllLeft", opLen2(ssa.OpShiftAllLeftUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.ShiftAllLeft", opLen2(ssa.OpShiftAllLeftUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.ShiftAllLeft", opLen2(ssa.OpShiftAllLeftUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int16x16.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int16x32.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Int32x4.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int32x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int32x16.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt32x16, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Int64x2.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt64x2, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int64x4.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt64x4, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int64x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatInt64x8, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint16x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint16x16.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint16x32.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint32x4.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint32x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint32x16.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint32x16, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint64x2.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint64x2, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint64x4.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint64x4, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint64x8.ShiftAllLeftConcat", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatUint64x8, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int16x8.ShiftAllLeftConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod16Int16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int16x16.ShiftAllLeftConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod16Int16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int16x32.ShiftAllLeftConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod16Int16x32, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x8.ShiftAllLeftConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod16Uint16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ShiftAllLeftConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod16Uint16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x32.ShiftAllLeftConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod16Uint16x32, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int32x4.ShiftAllLeftConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod32Int32x4, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int32x8.ShiftAllLeftConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod32Int32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int32x16.ShiftAllLeftConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod32Int32x16, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x4.ShiftAllLeftConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod32Uint32x4, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ShiftAllLeftConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod32Uint32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x16.ShiftAllLeftConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod32Uint32x16, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int64x2.ShiftAllLeftConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod64Int64x2, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int64x4.ShiftAllLeftConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod64Int64x4, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int64x8.ShiftAllLeftConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod64Int64x8, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x2.ShiftAllLeftConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod64Uint64x2, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x4.ShiftAllLeftConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod64Uint64x4, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x8.ShiftAllLeftConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllLeftConcatMod64Uint64x8, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.ShiftAllRight", opLen2(ssa.OpShiftAllRightInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.ShiftAllRight", opLen2(ssa.OpShiftAllRightInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.ShiftAllRight", opLen2(ssa.OpShiftAllRightInt16x32, types.TypeVec512), sys.AMD64)
@@ -1028,24 +1028,24 @@
addF(simdPackage, "Uint64x2.ShiftAllRight", opLen2(ssa.OpShiftAllRightUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.ShiftAllRight", opLen2(ssa.OpShiftAllRightUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.ShiftAllRight", opLen2(ssa.OpShiftAllRightUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int16x16.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int16x32.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Int32x4.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int32x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int32x16.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt32x16, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Int64x2.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt64x2, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int64x4.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt64x4, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int64x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatInt64x8, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint16x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint16x16.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint16x32.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint32x4.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint32x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint32x16.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint32x16, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint64x2.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint64x2, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint64x4.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint64x4, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint64x8.ShiftAllRightConcat", opLen2Imm8_2I(ssa.OpShiftAllRightConcatUint64x8, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int16x8.ShiftAllRightConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod16Int16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int16x16.ShiftAllRightConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod16Int16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int16x32.ShiftAllRightConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod16Int16x32, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x8.ShiftAllRightConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod16Uint16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ShiftAllRightConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod16Uint16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x32.ShiftAllRightConcatMod16", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod16Uint16x32, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int32x4.ShiftAllRightConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod32Int32x4, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int32x8.ShiftAllRightConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod32Int32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int32x16.ShiftAllRightConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod32Int32x16, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x4.ShiftAllRightConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod32Uint32x4, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ShiftAllRightConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod32Uint32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x16.ShiftAllRightConcatMod32", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod32Uint32x16, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int64x2.ShiftAllRightConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod64Int64x2, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int64x4.ShiftAllRightConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod64Int64x4, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int64x8.ShiftAllRightConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod64Int64x8, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x2.ShiftAllRightConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod64Uint64x2, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x4.ShiftAllRightConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod64Uint64x4, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x8.ShiftAllRightConcatMod64", opLen2Imm8_2I(ssa.OpShiftAllRightConcatMod64Uint64x8, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.ShiftLeft", opLen2(ssa.OpShiftLeftInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.ShiftLeft", opLen2(ssa.OpShiftLeftInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.ShiftLeft", opLen2(ssa.OpShiftLeftInt16x32, types.TypeVec512), sys.AMD64)
@@ -1064,24 +1064,24 @@
addF(simdPackage, "Uint64x2.ShiftLeft", opLen2(ssa.OpShiftLeftUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.ShiftLeft", opLen2(ssa.OpShiftLeftUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.ShiftLeft", opLen2(ssa.OpShiftLeftUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatInt16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x32.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatInt16x32, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int32x4.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x8.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatInt32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x16.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatInt32x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int64x2.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatInt64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int64x4.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatInt64x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int64x8.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatInt64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint16x8.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatUint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x16.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatUint16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint16x32.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatUint16x32, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint32x4.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatUint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x8.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatUint32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x16.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatUint32x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint64x2.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatUint64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint64x4.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatUint64x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint64x8.ShiftLeftConcat", opLen3(ssa.OpShiftLeftConcatUint64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int16x8.ShiftLeftConcatMod16", opLen3(ssa.OpShiftLeftConcatMod16Int16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x16.ShiftLeftConcatMod16", opLen3(ssa.OpShiftLeftConcatMod16Int16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x32.ShiftLeftConcatMod16", opLen3(ssa.OpShiftLeftConcatMod16Int16x32, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint16x8.ShiftLeftConcatMod16", opLen3(ssa.OpShiftLeftConcatMod16Uint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ShiftLeftConcatMod16", opLen3(ssa.OpShiftLeftConcatMod16Uint16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x32.ShiftLeftConcatMod16", opLen3(ssa.OpShiftLeftConcatMod16Uint16x32, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int32x4.ShiftLeftConcatMod32", opLen3(ssa.OpShiftLeftConcatMod32Int32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x8.ShiftLeftConcatMod32", opLen3(ssa.OpShiftLeftConcatMod32Int32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x16.ShiftLeftConcatMod32", opLen3(ssa.OpShiftLeftConcatMod32Int32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint32x4.ShiftLeftConcatMod32", opLen3(ssa.OpShiftLeftConcatMod32Uint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ShiftLeftConcatMod32", opLen3(ssa.OpShiftLeftConcatMod32Uint32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x16.ShiftLeftConcatMod32", opLen3(ssa.OpShiftLeftConcatMod32Uint32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int64x2.ShiftLeftConcatMod64", opLen3(ssa.OpShiftLeftConcatMod64Int64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x4.ShiftLeftConcatMod64", opLen3(ssa.OpShiftLeftConcatMod64Int64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int64x8.ShiftLeftConcatMod64", opLen3(ssa.OpShiftLeftConcatMod64Int64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint64x2.ShiftLeftConcatMod64", opLen3(ssa.OpShiftLeftConcatMod64Uint64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x4.ShiftLeftConcatMod64", opLen3(ssa.OpShiftLeftConcatMod64Uint64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint64x8.ShiftLeftConcatMod64", opLen3(ssa.OpShiftLeftConcatMod64Uint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.ShiftRight", opLen2(ssa.OpShiftRightInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.ShiftRight", opLen2(ssa.OpShiftRightInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.ShiftRight", opLen2(ssa.OpShiftRightInt16x32, types.TypeVec512), sys.AMD64)
@@ -1100,24 +1100,24 @@
addF(simdPackage, "Uint64x2.ShiftRight", opLen2(ssa.OpShiftRightUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.ShiftRight", opLen2(ssa.OpShiftRightUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.ShiftRight", opLen2(ssa.OpShiftRightUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatInt16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x32.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatInt16x32, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int32x4.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x8.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatInt32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x16.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatInt32x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int64x2.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatInt64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int64x4.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatInt64x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int64x8.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatInt64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint16x8.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatUint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x16.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatUint16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint16x32.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatUint16x32, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint32x4.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatUint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x8.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatUint32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x16.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatUint32x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint64x2.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatUint64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint64x4.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatUint64x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint64x8.ShiftRightConcat", opLen3(ssa.OpShiftRightConcatUint64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int16x8.ShiftRightConcatMod16", opLen3(ssa.OpShiftRightConcatMod16Int16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x16.ShiftRightConcatMod16", opLen3(ssa.OpShiftRightConcatMod16Int16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x32.ShiftRightConcatMod16", opLen3(ssa.OpShiftRightConcatMod16Int16x32, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint16x8.ShiftRightConcatMod16", opLen3(ssa.OpShiftRightConcatMod16Uint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ShiftRightConcatMod16", opLen3(ssa.OpShiftRightConcatMod16Uint16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x32.ShiftRightConcatMod16", opLen3(ssa.OpShiftRightConcatMod16Uint16x32, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int32x4.ShiftRightConcatMod32", opLen3(ssa.OpShiftRightConcatMod32Int32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x8.ShiftRightConcatMod32", opLen3(ssa.OpShiftRightConcatMod32Int32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x16.ShiftRightConcatMod32", opLen3(ssa.OpShiftRightConcatMod32Int32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint32x4.ShiftRightConcatMod32", opLen3(ssa.OpShiftRightConcatMod32Uint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ShiftRightConcatMod32", opLen3(ssa.OpShiftRightConcatMod32Uint32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x16.ShiftRightConcatMod32", opLen3(ssa.OpShiftRightConcatMod32Uint32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int64x2.ShiftRightConcatMod64", opLen3(ssa.OpShiftRightConcatMod64Int64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x4.ShiftRightConcatMod64", opLen3(ssa.OpShiftRightConcatMod64Int64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int64x8.ShiftRightConcatMod64", opLen3(ssa.OpShiftRightConcatMod64Int64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint64x2.ShiftRightConcatMod64", opLen3(ssa.OpShiftRightConcatMod64Uint64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x4.ShiftRightConcatMod64", opLen3(ssa.OpShiftRightConcatMod64Uint64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint64x8.ShiftRightConcatMod64", opLen3(ssa.OpShiftRightConcatMod64Uint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Sqrt", opLen1(ssa.OpSqrtFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Sqrt", opLen1(ssa.OpSqrtFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Sqrt", opLen1(ssa.OpSqrtFloat32x16, types.TypeVec512), sys.AMD64)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml
index 8e321fe..8658524 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/categories.yaml
@@ -86,15 +86,48 @@
commutative: false
documentation: !string |-
// NAME rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-- go: ShiftAllLeftConcat
+# Bizarre shifts
+- go: ShiftAllLeftConcatMod16
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts x[i] left by shift%16, filling any emptied lower bits
+ // with the high bits of y[i].
+ //
+ // z[i] = concat(x[i], y[i]) << (shift%16 - 16)
+- go: ShiftAllRightConcatMod16
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts x[i] right by shift%16, filling any emptied upper bits
+ // with the low bits of y[i].
+ //
+ // z[i] = concat(y[i], x[i]) >> (shift%16)
+- go: ShiftLeftConcatMod16
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts x[i] left by shift[i]%16, filling any emptied lower bits
+ // with the high bits of y[i].
+ //
+ // z[i] = concat(x[i], y[i]) << (shift[i]%16 - 16)
+- go: ShiftRightConcatMod16
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts x[i] right by shift[i]%16, filling any emptied upper bits
+ // with the low bits of y[i].
+ //
+ // z[i] = concat(y[i], x[i]) >> (shift[i]%16)
+- go: ShiftAllLeftConcatMod32
nameAndSizeCheck: true
commutative: false
documentation: !string |-
// NAME shifts x[i] left by shift%32, filing any empted lower bits
// with the high bits of y[i].
//
- // z[i] = concat(x[i], y[i]) << (shift%32 - 32)
-- go: ShiftAllRightConcat
+ // z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+- go: ShiftAllRightConcatMod32
nameAndSizeCheck: true
commutative: false
documentation: !string |-
@@ -102,15 +135,15 @@
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift%32)
-- go: ShiftLeftConcat
+- go: ShiftLeftConcatMod32
nameAndSizeCheck: true
commutative: false
documentation: !string |-
// NAME shifts x[i] left by shift[i]%32, filing any empted lower bits
// with the high bits of y[i].
//
- // z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
-- go: ShiftRightConcat
+ // z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+- go: ShiftRightConcatMod32
nameAndSizeCheck: true
commutative: false
documentation: !string |-
@@ -118,3 +151,35 @@
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
+- go: ShiftAllLeftConcatMod64
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts x[i] left by shift%64, filling any emptied lower bits
+ // with the high bits of y[i].
+ //
+ // z[i] = concat(x[i], y[i]) << (shift%64 - 64)
+- go: ShiftAllRightConcatMod64
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts x[i] right by shift%64, filling any emptied upper bits
+ // with the low bits of y[i].
+ //
+ // z[i] = concat(y[i], x[i]) >> (shift%64)
+- go: ShiftLeftConcatMod64
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts x[i] left by shift[i]%64, filling any emptied lower bits
+ // with the high bits of y[i].
+ //
+ // z[i] = concat(x[i], y[i]) << (shift[i]%64 - 64)
+- go: ShiftRightConcatMod64
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts x[i] right by shift[i]%64, filling any emptied upper bits
+ // with the low bits of y[i].
+ //
+ // z[i] = concat(y[i], x[i]) >> (shift[i]%64)
\ No newline at end of file
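As a scalar sketch of the ConcatMod32 semantics documented above (the helper names below are illustrative only and not part of the generated simd API; the Mod16 and Mod64 variants follow the same pattern with 16- and 64-bit elements):

package concatshift

// shiftAllLeftConcatMod32 models z = concat(x, y) << (shift%32 - 32):
// take the upper 32 bits of the 64-bit concatenation shifted left by shift%32,
// so the bits vacated at the bottom of x are filled from the top of y.
func shiftAllLeftConcatMod32(x, y uint32, shift uint8) uint32 {
	c := uint64(x)<<32 | uint64(y)
	return uint32((c << (uint64(shift) % 32)) >> 32)
}

// shiftAllRightConcatMod32 models z = concat(y, x) >> (shift%32):
// take the lower 32 bits of the 64-bit concatenation shifted right by shift%32,
// so the bits vacated at the top of x are filled from the bottom of y.
func shiftAllRightConcatMod32(x, y uint32, shift uint8) uint32 {
	c := uint64(y)<<32 | uint64(x)
	return uint32(c >> (uint64(shift) % 32))
}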
diff --git a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go.yaml
index ffbc6da..70a106f 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/ShiftRotate/go.yaml
@@ -156,8 +156,8 @@
- *any
# Bizzare shifts.
-- go: ShiftAllLeftConcat
- asm: "VPSHLD[WDQ]"
+- go: ShiftAllLeftConcatMod16
+ asm: "VPSHLDW"
operandOrder: 2I
in:
- *any
@@ -165,8 +165,8 @@
- *pureImm
out:
- *any
-- go: ShiftAllRightConcat
- asm: "VPSHRD[WDQ]"
+- go: ShiftAllRightConcatMod16
+ asm: "VPSHRDW"
operandOrder: 2I
in:
- *any
@@ -174,8 +174,8 @@
- *pureImm
out:
- *any
-- go: ShiftLeftConcat
- asm: "VPSHLDV[WDQ]"
+- go: ShiftLeftConcatMod16
+ asm: "VPSHLDVW"
in:
- go: $t
- go: $t
@@ -183,8 +183,80 @@
name: shift
out:
- go: $t
-- go: ShiftRightConcat
- asm: "VPSHRDV[WDQ]"
+- go: ShiftRightConcatMod16
+ asm: "VPSHRDVW"
+ in:
+ - go: $t
+ - go: $t
+ - base: uint
+ name: shift
+ out:
+ - go: $t
+- go: ShiftAllLeftConcatMod32
+ asm: "VPSHLDD"
+ operandOrder: 2I
+ in:
+ - *any
+ - *any
+ - *pureImm
+ out:
+ - *any
+- go: ShiftAllRightConcatMod32
+ asm: "VPSHRDD"
+ operandOrder: 2I
+ in:
+ - *any
+ - *any
+ - *pureImm
+ out:
+ - *any
+- go: ShiftLeftConcatMod32
+ asm: "VPSHLDVD"
+ in:
+ - go: $t
+ - go: $t
+ - base: uint
+ name: shift
+ out:
+ - go: $t
+- go: ShiftRightConcatMod32
+ asm: "VPSHRDVD"
+ in:
+ - go: $t
+ - go: $t
+ - base: uint
+ name: shift
+ out:
+ - go: $t
+- go: ShiftAllLeftConcatMod64
+ asm: "VPSHLDQ"
+ operandOrder: 2I
+ in:
+ - *any
+ - *any
+ - *pureImm
+ out:
+ - *any
+- go: ShiftAllRightConcatMod64
+ asm: "VPSHRDQ"
+ operandOrder: 2I
+ in:
+ - *any
+ - *any
+ - *pureImm
+ out:
+ - *any
+- go: ShiftLeftConcatMod64
+ asm: "VPSHLDVQ"
+ in:
+ - go: $t
+ - go: $t
+ - base: uint
+ name: shift
+ out:
+ - go: $t
+- go: ShiftRightConcatMod64
+ asm: "VPSHRDVQ"
in:
- go: $t
- go: $t
diff --git a/src/simd/archsimd/internal/simd_test/shift_test.go b/src/simd/archsimd/internal/simd_test/shift_test.go
index f3ab9d9..2db4315 100644
--- a/src/simd/archsimd/internal/simd_test/shift_test.go
+++ b/src/simd/archsimd/internal/simd_test/shift_test.go
@@ -109,20 +109,20 @@
}
testInt32x4Binary(t,
- func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllLeftConcat(y, 2) },
+ func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllLeftConcatMod32(y, 2) },
map2(salc(2)))
testInt32x4Binary(t,
- func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllLeftConcat(y, hide(2)) },
+ func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllLeftConcatMod32(y, hide(2)) },
map2(salc(hide(2))))
// TODO: If we expand the shift from uint8, add larger cases (e.g., 0x1000).
// The uint8 conversion is only here so the build will fail if we change it.
testInt32x4Binary(t,
- func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllLeftConcat(y, uint8(128)) },
+ func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllLeftConcatMod32(y, uint8(128)) },
map2(salc(128)))
testInt32x4Binary(t,
- func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllLeftConcat(y, hide(128)) },
+ func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllLeftConcatMod32(y, hide(128)) },
map2(salc(hide(128))))
// Signed ShiftAllRightConcat
@@ -133,17 +133,17 @@
}
testInt32x4Binary(t,
- func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllRightConcat(y, 2) },
+ func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllRightConcatMod32(y, 2) },
map2(sarc(2)))
testInt32x4Binary(t,
- func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllRightConcat(y, hide(2)) },
+ func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllRightConcatMod32(y, hide(2)) },
map2(sarc(hide(2))))
testInt32x4Binary(t,
- func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllRightConcat(y, uint8(128)) },
+ func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllRightConcatMod32(y, uint8(128)) },
map2(sarc(128)))
testInt32x4Binary(t,
- func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllRightConcat(y, hide(128)) },
+ func(x, y archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftAllRightConcatMod32(y, hide(128)) },
map2(sarc(hide(128))))
// Unsigned ShiftAllRightConcat
@@ -154,17 +154,17 @@
}
testUint32x4Binary(t,
- func(x, y archsimd.Uint32x4) archsimd.Uint32x4 { return x.ShiftAllRightConcat(y, 2) },
+ func(x, y archsimd.Uint32x4) archsimd.Uint32x4 { return x.ShiftAllRightConcatMod32(y, 2) },
map2(usarc(2)))
testUint32x4Binary(t,
- func(x, y archsimd.Uint32x4) archsimd.Uint32x4 { return x.ShiftAllRightConcat(y, hide(2)) },
+ func(x, y archsimd.Uint32x4) archsimd.Uint32x4 { return x.ShiftAllRightConcatMod32(y, hide(2)) },
map2(usarc(hide(2))))
testUint32x4Binary(t,
- func(x, y archsimd.Uint32x4) archsimd.Uint32x4 { return x.ShiftAllRightConcat(y, uint8(128)) },
+ func(x, y archsimd.Uint32x4) archsimd.Uint32x4 { return x.ShiftAllRightConcatMod32(y, uint8(128)) },
map2(usarc(128)))
testUint32x4Binary(t,
- func(x, y archsimd.Uint32x4) archsimd.Uint32x4 { return x.ShiftAllRightConcat(y, hide(128)) },
+ func(x, y archsimd.Uint32x4) archsimd.Uint32x4 { return x.ShiftAllRightConcatMod32(y, hide(128)) },
map2(usarc(hide(128))))
}
@@ -176,19 +176,19 @@
// Note that unlike their non-Concat counterparts, these wrap the shift count.
testInt32x4Ternary(t,
- func(x, y, z archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftLeftConcat(y, z.AsUint32x4()) },
+ func(x, y, z archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftLeftConcatMod32(y, z.AsUint32x4()) },
map3(func(x, y, z int32) int32 {
return int32(concatInt32s(x, y) >> (32 - uint32(z)%32))
}))
testInt32x4Ternary(t,
- func(x, y, z archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftRightConcat(y, z.AsUint32x4()) },
+ func(x, y, z archsimd.Int32x4) archsimd.Int32x4 { return x.ShiftRightConcatMod32(y, z.AsUint32x4()) },
map3(func(x, y, z int32) int32 {
return int32(concatInt32s(y, x) >> (uint32(z) % 32))
}))
testUint32x4Ternary(t,
- func(x, y, z archsimd.Uint32x4) archsimd.Uint32x4 { return x.ShiftRightConcat(y, z) },
+ func(x, y, z archsimd.Uint32x4) archsimd.Uint32x4 { return x.ShiftRightConcatMod32(y, z) },
map3(func(x, y, z uint32) uint32 {
return uint32(concatUint32s(y, x) >> (z % 32))
}))
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index 5c03258..2e61806 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -5984,187 +5984,191 @@
// Asm: VPSLLQ, CPU Feature: AVX512
func (x Uint64x8) ShiftAllLeft(shift uint64) Uint64x8
-/* ShiftAllLeftConcat */
+/* ShiftAllLeftConcatMod16 */
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+// ShiftAllLeftConcatMod16 shifts x[i] left by shift%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%16 - 16)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Int16x8) ShiftAllLeftConcat(y Int16x8, shift uint8) Int16x8
+func (x Int16x8) ShiftAllLeftConcatMod16(y Int16x8, shift uint8) Int16x8
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+// ShiftAllLeftConcatMod16 shifts x[i] left by shift%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%16 - 16)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Int16x16) ShiftAllLeftConcat(y Int16x16, shift uint8) Int16x16
+func (x Int16x16) ShiftAllLeftConcatMod16(y Int16x16, shift uint8) Int16x16
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+// ShiftAllLeftConcatMod16 shifts x[i] left by shift%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%16 - 16)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Int16x32) ShiftAllLeftConcat(y Int16x32, shift uint8) Int16x32
+func (x Int16x32) ShiftAllLeftConcatMod16(y Int16x32, shift uint8) Int16x32
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+// ShiftAllLeftConcatMod16 shifts x[i] left by shift%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Int32x4) ShiftAllLeftConcat(y Int32x4, shift uint8) Int32x4
-
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Int32x8) ShiftAllLeftConcat(y Int32x8, shift uint8) Int32x8
-
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Int32x16) ShiftAllLeftConcat(y Int32x16, shift uint8) Int32x16
-
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Int64x2) ShiftAllLeftConcat(y Int64x2, shift uint8) Int64x2
-
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Int64x4) ShiftAllLeftConcat(y Int64x4, shift uint8) Int64x4
-
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Int64x8) ShiftAllLeftConcat(y Int64x8, shift uint8) Int64x8
-
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%16 - 16)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) ShiftAllLeftConcat(y Uint16x8, shift uint8) Uint16x8
+func (x Uint16x8) ShiftAllLeftConcatMod16(y Uint16x8, shift uint8) Uint16x8
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+// ShiftAllLeftConcatMod16 shifts x[i] left by shift%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%16 - 16)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) ShiftAllLeftConcat(y Uint16x16, shift uint8) Uint16x16
+func (x Uint16x16) ShiftAllLeftConcatMod16(y Uint16x16, shift uint8) Uint16x16
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+// ShiftAllLeftConcatMod16 shifts x[i] left by shift%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%16 - 16)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) ShiftAllLeftConcat(y Uint16x32, shift uint8) Uint16x32
+func (x Uint16x32) ShiftAllLeftConcatMod16(y Uint16x32, shift uint8) Uint16x32
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+/* ShiftAllLeftConcatMod32 */
+
+// ShiftAllLeftConcatMod32 shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Uint32x4) ShiftAllLeftConcat(y Uint32x4, shift uint8) Uint32x4
+func (x Int32x4) ShiftAllLeftConcatMod32(y Int32x4, shift uint8) Int32x4
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+// ShiftAllLeftConcatMod32 shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Uint32x8) ShiftAllLeftConcat(y Uint32x8, shift uint8) Uint32x8
+func (x Int32x8) ShiftAllLeftConcatMod32(y Int32x8, shift uint8) Int32x8
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+// ShiftAllLeftConcatMod32 shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Uint32x16) ShiftAllLeftConcat(y Uint32x16, shift uint8) Uint32x16
+func (x Int32x16) ShiftAllLeftConcatMod32(y Int32x16, shift uint8) Int32x16
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+// ShiftAllLeftConcatMod32 shifts x[i] left by shift%32, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
+func (x Uint32x4) ShiftAllLeftConcatMod32(y Uint32x4, shift uint8) Uint32x4
+
+// ShiftAllLeftConcatMod32 shifts x[i] left by shift%32, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
+func (x Uint32x8) ShiftAllLeftConcatMod32(y Uint32x8, shift uint8) Uint32x8
+
+// ShiftAllLeftConcatMod32 shifts x[i] left by shift%32, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
+func (x Uint32x16) ShiftAllLeftConcatMod32(y Uint32x16, shift uint8) Uint32x16
+
+/* ShiftAllLeftConcatMod64 */
+
+// ShiftAllLeftConcatMod64 shifts x[i] left by shift%64, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift%64 - 64)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x2) ShiftAllLeftConcat(y Uint64x2, shift uint8) Uint64x2
+func (x Int64x2) ShiftAllLeftConcatMod64(y Int64x2, shift uint8) Int64x2
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+// ShiftAllLeftConcatMod64 shifts x[i] left by shift%64, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%64 - 64)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x4) ShiftAllLeftConcat(y Uint64x4, shift uint8) Uint64x4
+func (x Int64x4) ShiftAllLeftConcatMod64(y Int64x4, shift uint8) Int64x4
-// ShiftAllLeftConcat shifts x[i] left by shift%32, filing any empted lower bits
+// ShiftAllLeftConcatMod64 shifts x[i] left by shift%64, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift%64 - 64)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x8) ShiftAllLeftConcat(y Uint64x8, shift uint8) Uint64x8
+func (x Int64x8) ShiftAllLeftConcatMod64(y Int64x8, shift uint8) Int64x8
+
+// ShiftAllLeftConcatMod64 shifts x[i] left by shift%64, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift%64 - 64)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x2) ShiftAllLeftConcatMod64(y Uint64x2, shift uint8) Uint64x2
+
+// ShiftAllLeftConcatMod64 shifts x[i] left by shift%64, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift%64 - 64)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x4) ShiftAllLeftConcatMod64(y Uint64x4, shift uint8) Uint64x4
+
+// ShiftAllLeftConcatMod64 shifts x[i] left by shift%64, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift%64 - 64)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x8) ShiftAllLeftConcatMod64(y Uint64x8, shift uint8) Uint64x8
/* ShiftAllRight */
@@ -6276,39 +6280,71 @@
// Asm: VPSRLQ, CPU Feature: AVX512
func (x Uint64x8) ShiftAllRight(shift uint64) Uint64x8
-/* ShiftAllRightConcat */
+/* ShiftAllRightConcatMod16 */
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+// ShiftAllRightConcatMod16 shifts x[i] right by shift%16, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
+// z[i] = concat(y[i], x[i]) >> (shift%16)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Int16x8) ShiftAllRightConcat(y Int16x8, shift uint8) Int16x8
+func (x Int16x8) ShiftAllRightConcatMod16(y Int16x8, shift uint8) Int16x8
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+// ShiftAllRightConcatMod16 shifts x[i] right by shift%16, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
+// z[i] = concat(y[i], x[i]) >> (shift%16)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Int16x16) ShiftAllRightConcat(y Int16x16, shift uint8) Int16x16
+func (x Int16x16) ShiftAllRightConcatMod16(y Int16x16, shift uint8) Int16x16
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+// ShiftAllRightConcatMod16 shifts x[i] right by shift%16, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
+// z[i] = concat(y[i], x[i]) >> (shift%16)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Int16x32) ShiftAllRightConcat(y Int16x32, shift uint8) Int16x32
+func (x Int16x32) ShiftAllRightConcatMod16(y Int16x32, shift uint8) Int16x32
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+// ShiftAllRightConcatMod16 shifts x[i] right by shift%16, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift%16)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
+func (x Uint16x8) ShiftAllRightConcatMod16(y Uint16x8, shift uint8) Uint16x8
+
+// ShiftAllRightConcatMod16 shifts x[i] right by shift%16, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift%16)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
+func (x Uint16x16) ShiftAllRightConcatMod16(y Uint16x16, shift uint8) Uint16x16
+
+// ShiftAllRightConcatMod16 shifts x[i] right by shift%16, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift%16)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
+func (x Uint16x32) ShiftAllRightConcatMod16(y Uint16x32, shift uint8) Uint16x32
+
+/* ShiftAllRightConcatMod32 */
+
+// ShiftAllRightConcatMod32 shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift%32)
@@ -6316,9 +6352,9 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Int32x4) ShiftAllRightConcat(y Int32x4, shift uint8) Int32x4
+func (x Int32x4) ShiftAllRightConcatMod32(y Int32x4, shift uint8) Int32x4
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+// ShiftAllRightConcatMod32 shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift%32)
@@ -6326,9 +6362,9 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Int32x8) ShiftAllRightConcat(y Int32x8, shift uint8) Int32x8
+func (x Int32x8) ShiftAllRightConcatMod32(y Int32x8, shift uint8) Int32x8
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+// ShiftAllRightConcatMod32 shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift%32)
@@ -6336,69 +6372,9 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Int32x16) ShiftAllRightConcat(y Int32x16, shift uint8) Int32x16
+func (x Int32x16) ShiftAllRightConcatMod32(y Int32x16, shift uint8) Int32x16
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Int64x2) ShiftAllRightConcat(y Int64x2, shift uint8) Int64x2
-
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Int64x4) ShiftAllRightConcat(y Int64x4, shift uint8) Int64x4
-
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Int64x8) ShiftAllRightConcat(y Int64x8, shift uint8) Int64x8
-
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) ShiftAllRightConcat(y Uint16x8, shift uint8) Uint16x8
-
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) ShiftAllRightConcat(y Uint16x16, shift uint8) Uint16x16
-
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
-//
-// A non-constant value of shift may result in significantly worse performance for this operation.
-//
-// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) ShiftAllRightConcat(y Uint16x32, shift uint8) Uint16x32
-
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+// ShiftAllRightConcatMod32 shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift%32)
@@ -6406,9 +6382,9 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Uint32x4) ShiftAllRightConcat(y Uint32x4, shift uint8) Uint32x4
+func (x Uint32x4) ShiftAllRightConcatMod32(y Uint32x4, shift uint8) Uint32x4
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+// ShiftAllRightConcatMod32 shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift%32)
@@ -6416,9 +6392,9 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Uint32x8) ShiftAllRightConcat(y Uint32x8, shift uint8) Uint32x8
+func (x Uint32x8) ShiftAllRightConcatMod32(y Uint32x8, shift uint8) Uint32x8
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+// ShiftAllRightConcatMod32 shifts x[i] right by shift%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift%32)
@@ -6426,37 +6402,69 @@
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Uint32x16) ShiftAllRightConcat(y Uint32x16, shift uint8) Uint32x16
+func (x Uint32x16) ShiftAllRightConcatMod32(y Uint32x16, shift uint8) Uint32x16
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+/* ShiftAllRightConcatMod64 */
+
+// ShiftAllRightConcatMod64 shifts x[i] right by shift%64, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
+// z[i] = concat(y[i], x[i]) >> (shift%64)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x2) ShiftAllRightConcat(y Uint64x2, shift uint8) Uint64x2
+func (x Int64x2) ShiftAllRightConcatMod64(y Int64x2, shift uint8) Int64x2
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+// ShiftAllRightConcatMod64 shifts x[i] right by shift%64, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
+// z[i] = concat(y[i], x[i]) >> (shift%64)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x4) ShiftAllRightConcat(y Uint64x4, shift uint8) Uint64x4
+func (x Int64x4) ShiftAllRightConcatMod64(y Int64x4, shift uint8) Int64x4
-// ShiftAllRightConcat shifts x[i] right by shift%32, filling any emptied upper bits
+// ShiftAllRightConcatMod64 shifts x[i] right by shift%64, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift%32)
+// z[i] = concat(y[i], x[i]) >> (shift%64)
//
// A non-constant value of shift may result in significantly worse performance for this operation.
//
// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x8) ShiftAllRightConcat(y Uint64x8, shift uint8) Uint64x8
+func (x Int64x8) ShiftAllRightConcatMod64(y Int64x8, shift uint8) Int64x8
+
+// ShiftAllRightConcatMod64 shifts x[i] right by shift%64, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift%64)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x2) ShiftAllRightConcatMod64(y Uint64x2, shift uint8) Uint64x2
+
+// ShiftAllRightConcatMod64 shifts x[i] right by shift%64, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift%64)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x4) ShiftAllRightConcatMod64(y Uint64x4, shift uint8) Uint64x4
+
+// ShiftAllRightConcatMod64 shifts x[i] right by shift%64, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift%64)
+//
+// A non-constant value of shift may result in significantly worse performance for this operation.
+//
+// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x8) ShiftAllRightConcatMod64(y Uint64x8, shift uint8) Uint64x8
/* ShiftLeft */
@@ -6568,151 +6576,155 @@
// Asm: VPSLLVQ, CPU Feature: AVX512
func (x Uint64x8) ShiftLeft(shift Uint64x8) Uint64x8
-/* ShiftLeftConcat */
+/* ShiftLeftConcatMod16 */
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+// ShiftLeftConcatMod16 shifts x[i] left by shift[i]%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%16 - 16)
//
// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Int16x8) ShiftLeftConcat(y Int16x8, shift Uint16x8) Int16x8
+func (x Int16x8) ShiftLeftConcatMod16(y Int16x8, shift Uint16x8) Int16x8
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+// ShiftLeftConcatMod16 shifts x[i] left by shift[i]%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%16 - 16)
//
// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Int16x16) ShiftLeftConcat(y Int16x16, shift Uint16x16) Int16x16
+func (x Int16x16) ShiftLeftConcatMod16(y Int16x16, shift Uint16x16) Int16x16
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+// ShiftLeftConcatMod16 shifts x[i] left by shift[i]%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%16 - 16)
//
// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Int16x32) ShiftLeftConcat(y Int16x32, shift Uint16x32) Int16x32
+func (x Int16x32) ShiftLeftConcatMod16(y Int16x32, shift Uint16x32) Int16x32
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+// ShiftLeftConcatMod16 shifts x[i] left by shift[i]%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
-//
-// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Int32x4) ShiftLeftConcat(y Int32x4, shift Uint32x4) Int32x4
-
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
-//
-// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Int32x8) ShiftLeftConcat(y Int32x8, shift Uint32x8) Int32x8
-
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
-//
-// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Int32x16) ShiftLeftConcat(y Int32x16, shift Uint32x16) Int32x16
-
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
-//
-// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x2) ShiftLeftConcat(y Int64x2, shift Uint64x2) Int64x2
-
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
-//
-// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x4) ShiftLeftConcat(y Int64x4, shift Uint64x4) Int64x4
-
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
-//
-// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x8) ShiftLeftConcat(y Int64x8, shift Uint64x8) Int64x8
-
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
-// with the high bits of y[i].
-//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%16 - 16)
//
// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) ShiftLeftConcat(y Uint16x8, shift Uint16x8) Uint16x8
+func (x Uint16x8) ShiftLeftConcatMod16(y Uint16x8, shift Uint16x8) Uint16x8
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+// ShiftLeftConcatMod16 shifts x[i] left by shift[i]%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%16 - 16)
//
// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) ShiftLeftConcat(y Uint16x16, shift Uint16x16) Uint16x16
+func (x Uint16x16) ShiftLeftConcatMod16(y Uint16x16, shift Uint16x16) Uint16x16
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+// ShiftLeftConcatMod16 shifts x[i] left by shift[i]%16, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%16 - 16)
//
// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) ShiftLeftConcat(y Uint16x32, shift Uint16x32) Uint16x32
+func (x Uint16x32) ShiftLeftConcatMod16(y Uint16x32, shift Uint16x32) Uint16x32
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+/* ShiftLeftConcatMod32 */
+
+// ShiftLeftConcatMod32 shifts x[i] left by shift[i]%32, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
//
// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x4) ShiftLeftConcat(y Uint32x4, shift Uint32x4) Uint32x4
+func (x Int32x4) ShiftLeftConcatMod32(y Int32x4, shift Uint32x4) Int32x4
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+// ShiftLeftConcatMod32 shifts x[i] left by shift[i]%32, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
//
// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x8) ShiftLeftConcat(y Uint32x8, shift Uint32x8) Uint32x8
+func (x Int32x8) ShiftLeftConcatMod32(y Int32x8, shift Uint32x8) Int32x8
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+// ShiftLeftConcatMod32 shifts x[i] left by shift[i]%32, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
//
// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x16) ShiftLeftConcat(y Uint32x16, shift Uint32x16) Uint32x16
+func (x Int32x16) ShiftLeftConcatMod32(y Int32x16, shift Uint32x16) Int32x16
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+// ShiftLeftConcatMod32 shifts x[i] left by shift[i]%32, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
//
-// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x2) ShiftLeftConcat(y Uint64x2, shift Uint64x2) Uint64x2
+// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
+func (x Uint32x4) ShiftLeftConcatMod32(y Uint32x4, shift Uint32x4) Uint32x4
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+// ShiftLeftConcatMod32 shifts x[i] left by shift[i]%32, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
//
-// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x4) ShiftLeftConcat(y Uint64x4, shift Uint64x4) Uint64x4
+// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
+func (x Uint32x8) ShiftLeftConcatMod32(y Uint32x8, shift Uint32x8) Uint32x8
-// ShiftLeftConcat shifts x[i] left by shift[i]%32, filing any empted lower bits
+// ShiftLeftConcatMod32 shifts x[i] left by shift[i]%32, filling any emptied lower bits
// with the high bits of y[i].
//
-// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+// z[i] = concat(x[i], y[i]) << (shift[i]%32 - 32)
+//
+// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
+func (x Uint32x16) ShiftLeftConcatMod32(y Uint32x16, shift Uint32x16) Uint32x16
+
+/* ShiftLeftConcatMod64 */
+
+// ShiftLeftConcatMod64 shifts x[i] left by shift[i]%64, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift[i]%64 - 64)
//
// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x8) ShiftLeftConcat(y Uint64x8, shift Uint64x8) Uint64x8
+func (x Int64x2) ShiftLeftConcatMod64(y Int64x2, shift Uint64x2) Int64x2
+
+// ShiftLeftConcatMod64 shifts x[i] left by shift[i]%64, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift[i]%64 - 64)
+//
+// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
+func (x Int64x4) ShiftLeftConcatMod64(y Int64x4, shift Uint64x4) Int64x4
+
+// ShiftLeftConcatMod64 shifts x[i] left by shift[i]%64, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift[i]%64 - 64)
+//
+// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
+func (x Int64x8) ShiftLeftConcatMod64(y Int64x8, shift Uint64x8) Int64x8
+
+// ShiftLeftConcatMod64 shifts x[i] left by shift[i]%64, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift[i]%64 - 64)
+//
+// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x2) ShiftLeftConcatMod64(y Uint64x2, shift Uint64x2) Uint64x2
+
+// ShiftLeftConcatMod64 shifts x[i] left by shift[i]%64, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift[i]%64 - 64)
+//
+// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x4) ShiftLeftConcatMod64(y Uint64x4, shift Uint64x4) Uint64x4
+
+// ShiftLeftConcatMod64 shifts x[i] left by shift[i]%64, filling any emptied lower bits
+// with the high bits of y[i].
+//
+// z[i] = concat(x[i], y[i]) << (shift[i]%64 - 64)
+//
+// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x8) ShiftLeftConcatMod64(y Uint64x8, shift Uint64x8) Uint64x8
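The `<< (shift[i]%32 - 32)` shift amount above is negative for shift < 32, so the formula can be read as shifting the double-width concat(x[i], y[i]) right by 32 - shift%32 bits and keeping the low 32 bits, which is the same as keeping the upper 32 bits of concat(x[i], y[i]) << shift%32. A per-lane scalar sketch for the 32-bit case (helper name invented, not part of this patch):

// shiftLeftConcatMod32Lane models one lane of ShiftLeftConcatMod32:
// x is shifted left by shift%32 and the vacated low bits are filled with
// the high bits of y, i.e. the upper 32 bits of concat(x, y) << s.
// Illustrative only; not part of the patched sources.
func shiftLeftConcatMod32Lane(x, y, shift uint32) uint32 {
	s := uint(shift) % 32
	if s == 0 {
		return x // no shift: the result is just x
	}
	return (x << s) | (y >> (32 - s))
}

The 16- and 64-bit variants follow the same pattern with 16 or 64 in place of 32.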
/* ShiftRight */
@@ -6824,151 +6836,155 @@
// Asm: VPSRLVQ, CPU Feature: AVX512
func (x Uint64x8) ShiftRight(shift Uint64x8) Uint64x8
-/* ShiftRightConcat */
+/* ShiftRightConcatMod16 */
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+// ShiftRightConcatMod16 shifts x[i] right by shift[i]%16, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
+// z[i] = concat(y[i], x[i]) >> (shift[i]%16)
//
// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Int16x8) ShiftRightConcat(y Int16x8, shift Uint16x8) Int16x8
+func (x Int16x8) ShiftRightConcatMod16(y Int16x8, shift Uint16x8) Int16x8
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+// ShiftRightConcatMod16 shifts x[i] right by shift[i]%16, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
+// z[i] = concat(y[i], x[i]) >> (shift[i]%16)
//
// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Int16x16) ShiftRightConcat(y Int16x16, shift Uint16x16) Int16x16
+func (x Int16x16) ShiftRightConcatMod16(y Int16x16, shift Uint16x16) Int16x16
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+// ShiftRightConcatMod16 shifts x[i] right by shift[i]%16, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
+// z[i] = concat(y[i], x[i]) >> (shift[i]%16)
//
// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Int16x32) ShiftRightConcat(y Int16x32, shift Uint16x32) Int16x32
+func (x Int16x32) ShiftRightConcatMod16(y Int16x32, shift Uint16x32) Int16x32
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+// ShiftRightConcatMod16 shifts x[i] right by shift[i]%16, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift[i]%16)
+//
+// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
+func (x Uint16x8) ShiftRightConcatMod16(y Uint16x8, shift Uint16x8) Uint16x8
+
+// ShiftRightConcatMod16 shifts x[i] right by shift[i]%16, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift[i]%16)
+//
+// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
+func (x Uint16x16) ShiftRightConcatMod16(y Uint16x16, shift Uint16x16) Uint16x16
+
+// ShiftRightConcatMod16 shifts x[i] right by shift[i]%16, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift[i]%16)
+//
+// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
+func (x Uint16x32) ShiftRightConcatMod16(y Uint16x32, shift Uint16x32) Uint16x32
+
+/* ShiftRightConcatMod32 */
+
+// ShiftRightConcatMod32 shifts x[i] right by shift[i]%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
//
// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Int32x4) ShiftRightConcat(y Int32x4, shift Uint32x4) Int32x4
+func (x Int32x4) ShiftRightConcatMod32(y Int32x4, shift Uint32x4) Int32x4
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+// ShiftRightConcatMod32 shifts x[i] right by shift[i]%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
//
// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Int32x8) ShiftRightConcat(y Int32x8, shift Uint32x8) Int32x8
+func (x Int32x8) ShiftRightConcatMod32(y Int32x8, shift Uint32x8) Int32x8
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+// ShiftRightConcatMod32 shifts x[i] right by shift[i]%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
//
// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Int32x16) ShiftRightConcat(y Int32x16, shift Uint32x16) Int32x16
+func (x Int32x16) ShiftRightConcatMod32(y Int32x16, shift Uint32x16) Int32x16
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
-//
-// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x2) ShiftRightConcat(y Int64x2, shift Uint64x2) Int64x2
-
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
-//
-// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x4) ShiftRightConcat(y Int64x4, shift Uint64x4) Int64x4
-
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
-//
-// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x8) ShiftRightConcat(y Int64x8, shift Uint64x8) Int64x8
-
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
-//
-// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) ShiftRightConcat(y Uint16x8, shift Uint16x8) Uint16x8
-
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
-//
-// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) ShiftRightConcat(y Uint16x16, shift Uint16x16) Uint16x16
-
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
-// with the low bits of y[i].
-//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
-//
-// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) ShiftRightConcat(y Uint16x32, shift Uint16x32) Uint16x32
-
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+// ShiftRightConcatMod32 shifts x[i] right by shift[i]%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
//
// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x4) ShiftRightConcat(y Uint32x4, shift Uint32x4) Uint32x4
+func (x Uint32x4) ShiftRightConcatMod32(y Uint32x4, shift Uint32x4) Uint32x4
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+// ShiftRightConcatMod32 shifts x[i] right by shift[i]%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
//
// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x8) ShiftRightConcat(y Uint32x8, shift Uint32x8) Uint32x8
+func (x Uint32x8) ShiftRightConcatMod32(y Uint32x8, shift Uint32x8) Uint32x8
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+// ShiftRightConcatMod32 shifts x[i] right by shift[i]%32, filling any emptied upper bits
// with the low bits of y[i].
//
// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
//
// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x16) ShiftRightConcat(y Uint32x16, shift Uint32x16) Uint32x16
+func (x Uint32x16) ShiftRightConcatMod32(y Uint32x16, shift Uint32x16) Uint32x16
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+/* ShiftRightConcatMod64 */
+
+// ShiftRightConcatMod64 shifts x[i] right by shift[i]%64, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
+// z[i] = concat(y[i], x[i]) >> (shift[i]%64)
//
// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x2) ShiftRightConcat(y Uint64x2, shift Uint64x2) Uint64x2
+func (x Int64x2) ShiftRightConcatMod64(y Int64x2, shift Uint64x2) Int64x2
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+// ShiftRightConcatMod64 shifts x[i] right by shift[i]%64, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
+// z[i] = concat(y[i], x[i]) >> (shift[i]%64)
//
// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x4) ShiftRightConcat(y Uint64x4, shift Uint64x4) Uint64x4
+func (x Int64x4) ShiftRightConcatMod64(y Int64x4, shift Uint64x4) Int64x4
-// ShiftRightConcat shifts x[i] right by shift[i]%32, filling any emptied upper bits
+// ShiftRightConcatMod64 shifts x[i] right by shift[i]%64, filling any emptied upper bits
// with the low bits of y[i].
//
-// z[i] = concat(y[i], x[i]) >> (shift[i]%32)
+// z[i] = concat(y[i], x[i]) >> (shift[i]%64)
//
// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x8) ShiftRightConcat(y Uint64x8, shift Uint64x8) Uint64x8
+func (x Int64x8) ShiftRightConcatMod64(y Int64x8, shift Uint64x8) Int64x8
+
+// ShiftRightConcatMod64 shifts x[i] right by shift[i]%64, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift[i]%64)
+//
+// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x2) ShiftRightConcatMod64(y Uint64x2, shift Uint64x2) Uint64x2
+
+// ShiftRightConcatMod64 shifts x[i] right by shift[i]%64, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift[i]%64)
+//
+// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x4) ShiftRightConcatMod64(y Uint64x4, shift Uint64x4) Uint64x4
+
+// ShiftRightConcatMod64 shifts x[i] right by shift[i]%64, filling any emptied upper bits
+// with the low bits of y[i].
+//
+// z[i] = concat(y[i], x[i]) >> (shift[i]%64)
+//
+// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x8) ShiftRightConcatMod64(y Uint64x8, shift Uint64x8) Uint64x8
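The mirrored right-shift variant, sketched per lane for the 16-bit case (helper name invented, not part of this patch), assuming z[i] = concat(y[i], x[i]) >> (shift[i]%16):

// shiftRightConcatMod16Lane models one lane of ShiftRightConcatMod16:
// x is shifted right by shift%16 and the vacated high bits are filled with
// the low bits of y, i.e. the low 16 bits of concat(y, x) >> s.
// Illustrative only; not part of the patched sources.
func shiftRightConcatMod16Lane(x, y, shift uint16) uint16 {
	s := uint(shift) % 16
	if s == 0 {
		return x // no shift: the result is just x
	}
	return (x >> s) | (y << (16 - s))
}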
/* Sqrt */