[go/dev.simd] [dev.simd] simd, cmd/compile: rename (Add|Sub)Pairs* to Concat(Add|Sub)Pairs*

1 view
Skip to first unread message

Junyang Shao (Gerrit)

unread,
5:00 PM (6 hours ago) 5:00 PM
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Junyang Shao has uploaded the change for review

Commit message

[dev.simd] simd, cmd/compile: rename (Add|Sub)Pairs* to Concat(Add|Sub)Pairs*

For #78979.
Change-Id: I7a3f6fff272bc7630c116dc265d1677c0187d12a

Change diff

diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 618a7ad..402e711 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -249,16 +249,6 @@
ssa.OpAMD64VPADDQ128,
ssa.OpAMD64VPADDQ256,
ssa.OpAMD64VPADDQ512,
- ssa.OpAMD64VHADDPS128,
- ssa.OpAMD64VHADDPD128,
- ssa.OpAMD64VPHADDW128,
- ssa.OpAMD64VPHADDD128,
- ssa.OpAMD64VHADDPS256,
- ssa.OpAMD64VHADDPD256,
- ssa.OpAMD64VPHADDW256,
- ssa.OpAMD64VPHADDD256,
- ssa.OpAMD64VPHADDSW128,
- ssa.OpAMD64VPHADDSW256,
ssa.OpAMD64VPADDSB128,
ssa.OpAMD64VPADDSB256,
ssa.OpAMD64VPADDSB512,
@@ -289,6 +279,26 @@
ssa.OpAMD64VPAVGW128,
ssa.OpAMD64VPAVGW256,
ssa.OpAMD64VPAVGW512,
+ ssa.OpAMD64VHADDPS128,
+ ssa.OpAMD64VHADDPD128,
+ ssa.OpAMD64VPHADDW128,
+ ssa.OpAMD64VPHADDD128,
+ ssa.OpAMD64VHADDPS256,
+ ssa.OpAMD64VHADDPD256,
+ ssa.OpAMD64VPHADDW256,
+ ssa.OpAMD64VPHADDD256,
+ ssa.OpAMD64VPHADDSW128,
+ ssa.OpAMD64VPHADDSW256,
+ ssa.OpAMD64VHSUBPS128,
+ ssa.OpAMD64VHSUBPD128,
+ ssa.OpAMD64VPHSUBW128,
+ ssa.OpAMD64VPHSUBD128,
+ ssa.OpAMD64VHSUBPS256,
+ ssa.OpAMD64VHSUBPD256,
+ ssa.OpAMD64VPHSUBW256,
+ ssa.OpAMD64VPHSUBD256,
+ ssa.OpAMD64VPHSUBSW128,
+ ssa.OpAMD64VPHSUBSW256,
ssa.OpAMD64VDIVPS128,
ssa.OpAMD64VDIVPS256,
ssa.OpAMD64VDIVPS512,
@@ -519,16 +529,6 @@
ssa.OpAMD64VPSUBQ128,
ssa.OpAMD64VPSUBQ256,
ssa.OpAMD64VPSUBQ512,
- ssa.OpAMD64VHSUBPS128,
- ssa.OpAMD64VHSUBPD128,
- ssa.OpAMD64VPHSUBW128,
- ssa.OpAMD64VPHSUBD128,
- ssa.OpAMD64VHSUBPS256,
- ssa.OpAMD64VHSUBPD256,
- ssa.OpAMD64VPHSUBW256,
- ssa.OpAMD64VPHSUBD256,
- ssa.OpAMD64VPHSUBSW128,
- ssa.OpAMD64VPHSUBSW256,
ssa.OpAMD64VPSUBSB128,
ssa.OpAMD64VPSUBSB256,
ssa.OpAMD64VPSUBSB512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 8fdf860..cdbedbc 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -56,20 +56,6 @@
(AddUint64x2 ...) => (VPADDQ128 ...)
(AddUint64x4 ...) => (VPADDQ256 ...)
(AddUint64x8 ...) => (VPADDQ512 ...)
-(AddPairsFloat32x4 ...) => (VHADDPS128 ...)
-(AddPairsFloat64x2 ...) => (VHADDPD128 ...)
-(AddPairsInt16x8 ...) => (VPHADDW128 ...)
-(AddPairsInt32x4 ...) => (VPHADDD128 ...)
-(AddPairsUint16x8 ...) => (VPHADDW128 ...)
-(AddPairsUint32x4 ...) => (VPHADDD128 ...)
-(AddPairsGroupedFloat32x8 ...) => (VHADDPS256 ...)
-(AddPairsGroupedFloat64x4 ...) => (VHADDPD256 ...)
-(AddPairsGroupedInt16x16 ...) => (VPHADDW256 ...)
-(AddPairsGroupedInt32x8 ...) => (VPHADDD256 ...)
-(AddPairsGroupedUint16x16 ...) => (VPHADDW256 ...)
-(AddPairsGroupedUint32x8 ...) => (VPHADDD256 ...)
-(AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
-(AddPairsSaturatedGroupedInt16x16 ...) => (VPHADDSW256 ...)
(AddSaturatedInt8x16 ...) => (VPADDSB128 ...)
(AddSaturatedInt8x32 ...) => (VPADDSB256 ...)
(AddSaturatedInt8x64 ...) => (VPADDSB512 ...)
@@ -186,6 +172,20 @@
(CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
(CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
(CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(ConcatAddPairsFloat32x4 ...) => (VHADDPS128 ...)
+(ConcatAddPairsFloat64x2 ...) => (VHADDPD128 ...)
+(ConcatAddPairsInt16x8 ...) => (VPHADDW128 ...)
+(ConcatAddPairsInt32x4 ...) => (VPHADDD128 ...)
+(ConcatAddPairsUint16x8 ...) => (VPHADDW128 ...)
+(ConcatAddPairsUint32x4 ...) => (VPHADDD128 ...)
+(ConcatAddPairsGroupedFloat32x8 ...) => (VHADDPS256 ...)
+(ConcatAddPairsGroupedFloat64x4 ...) => (VHADDPD256 ...)
+(ConcatAddPairsGroupedInt16x16 ...) => (VPHADDW256 ...)
+(ConcatAddPairsGroupedInt32x8 ...) => (VPHADDD256 ...)
+(ConcatAddPairsGroupedUint16x16 ...) => (VPHADDW256 ...)
+(ConcatAddPairsGroupedUint32x8 ...) => (VPHADDD256 ...)
+(ConcatAddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
+(ConcatAddPairsSaturatedGroupedInt16x16 ...) => (VPHADDSW256 ...)
(ConcatPermuteFloat32x4 ...) => (VPERMI2PS128 ...)
(ConcatPermuteFloat32x8 ...) => (VPERMI2PS256 ...)
(ConcatPermuteFloat32x16 ...) => (VPERMI2PS512 ...)
@@ -219,6 +219,20 @@
(ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
(ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
(ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
+(ConcatSubPairsFloat32x4 ...) => (VHSUBPS128 ...)
+(ConcatSubPairsFloat64x2 ...) => (VHSUBPD128 ...)
+(ConcatSubPairsInt16x8 ...) => (VPHSUBW128 ...)
+(ConcatSubPairsInt32x4 ...) => (VPHSUBD128 ...)
+(ConcatSubPairsUint16x8 ...) => (VPHSUBW128 ...)
+(ConcatSubPairsUint32x4 ...) => (VPHSUBD128 ...)
+(ConcatSubPairsGroupedFloat32x8 ...) => (VHSUBPS256 ...)
+(ConcatSubPairsGroupedFloat64x4 ...) => (VHSUBPD256 ...)
+(ConcatSubPairsGroupedInt16x16 ...) => (VPHSUBW256 ...)
+(ConcatSubPairsGroupedInt32x8 ...) => (VPHSUBD256 ...)
+(ConcatSubPairsGroupedUint16x16 ...) => (VPHSUBW256 ...)
+(ConcatSubPairsGroupedUint32x8 ...) => (VPHSUBD256 ...)
+(ConcatSubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
+(ConcatSubPairsSaturatedGroupedInt16x16 ...) => (VPHSUBSW256 ...)
(ConvertToFloat32Float64x2 ...) => (VCVTPD2PSX128 ...)
(ConvertToFloat32Float64x4 ...) => (VCVTPD2PSY128 ...)
(ConvertToFloat32Float64x8 ...) => (VCVTPD2PS256 ...)
@@ -1198,20 +1212,6 @@
(SubUint64x2 ...) => (VPSUBQ128 ...)
(SubUint64x4 ...) => (VPSUBQ256 ...)
(SubUint64x8 ...) => (VPSUBQ512 ...)
-(SubPairsFloat32x4 ...) => (VHSUBPS128 ...)
-(SubPairsFloat64x2 ...) => (VHSUBPD128 ...)
-(SubPairsInt16x8 ...) => (VPHSUBW128 ...)
-(SubPairsInt32x4 ...) => (VPHSUBD128 ...)
-(SubPairsUint16x8 ...) => (VPHSUBW128 ...)
-(SubPairsUint32x4 ...) => (VPHSUBD128 ...)
-(SubPairsGroupedFloat32x8 ...) => (VHSUBPS256 ...)
-(SubPairsGroupedFloat64x4 ...) => (VHSUBPD256 ...)
-(SubPairsGroupedInt16x16 ...) => (VPHSUBW256 ...)
-(SubPairsGroupedInt32x8 ...) => (VPHSUBD256 ...)
-(SubPairsGroupedUint16x16 ...) => (VPHSUBW256 ...)
-(SubPairsGroupedUint32x8 ...) => (VPHSUBD256 ...)
-(SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
-(SubPairsSaturatedGroupedInt16x16 ...) => (VPHSUBSW256 ...)
(SubSaturatedInt8x16 ...) => (VPSUBSB128 ...)
(SubSaturatedInt8x32 ...) => (VPSUBSB256 ...)
(SubSaturatedInt8x64 ...) => (VPSUBSB512 ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index f1d87ab..3644bbc 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -47,20 +47,6 @@
{name: "AddInt64x2", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AddInt64x4", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AddInt64x8", argLength: 2, commutative: true}, // ARCH:amd64
- {name: "AddPairsFloat32x4", argLength: 2}, // ARCH:amd64
- {name: "AddPairsFloat64x2", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedFloat32x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedFloat64x4", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedInt16x16", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedInt32x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedUint16x16", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedUint32x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsInt16x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsInt32x4", argLength: 2}, // ARCH:amd64
- {name: "AddPairsSaturatedGroupedInt16x16", argLength: 2}, // ARCH:amd64
- {name: "AddPairsSaturatedInt16x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsUint16x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsUint32x4", argLength: 2}, // ARCH:amd64
{name: "AddSaturatedInt8x16", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AddSaturatedInt8x32", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AddSaturatedInt8x64", argLength: 2, commutative: true}, // ARCH:amd64
@@ -177,6 +163,20 @@
{name: "CompressUint64x2", argLength: 2}, // ARCH:amd64
{name: "CompressUint64x4", argLength: 2}, // ARCH:amd64
{name: "CompressUint64x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsFloat32x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsFloat64x2", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedFloat32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedFloat64x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedInt16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedInt32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedUint16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedUint32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsInt32x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsSaturatedGroupedInt16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsSaturatedInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsUint16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsUint32x4", argLength: 2}, // ARCH:amd64
{name: "ConcatPermuteFloat32x4", argLength: 3}, // ARCH:amd64
{name: "ConcatPermuteFloat32x8", argLength: 3}, // ARCH:amd64
{name: "ConcatPermuteFloat32x16", argLength: 3}, // ARCH:amd64
@@ -207,6 +207,20 @@
{name: "ConcatPermuteUint64x2", argLength: 3}, // ARCH:amd64
{name: "ConcatPermuteUint64x4", argLength: 3}, // ARCH:amd64
{name: "ConcatPermuteUint64x8", argLength: 3}, // ARCH:amd64
+ {name: "ConcatSubPairsFloat32x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsFloat64x2", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedFloat32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedFloat64x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedInt16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedInt32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedUint16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedUint32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsInt32x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsSaturatedGroupedInt16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsSaturatedInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsUint16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsUint32x4", argLength: 2}, // ARCH:amd64
{name: "ConvertToFloat32Float64x2", argLength: 1}, // ARCH:amd64
{name: "ConvertToFloat32Float64x4", argLength: 1}, // ARCH:amd64
{name: "ConvertToFloat32Float64x8", argLength: 1}, // ARCH:amd64
@@ -999,20 +1013,6 @@
{name: "SubInt64x2", argLength: 2}, // ARCH:amd64
{name: "SubInt64x4", argLength: 2}, // ARCH:amd64
{name: "SubInt64x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsFloat32x4", argLength: 2}, // ARCH:amd64
- {name: "SubPairsFloat64x2", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedFloat32x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedFloat64x4", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedInt16x16", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedInt32x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedUint16x16", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedUint32x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsInt16x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsInt32x4", argLength: 2}, // ARCH:amd64
- {name: "SubPairsSaturatedGroupedInt16x16", argLength: 2}, // ARCH:amd64
- {name: "SubPairsSaturatedInt16x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsUint16x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsUint32x4", argLength: 2}, // ARCH:amd64
{name: "SubSaturatedInt8x16", argLength: 2}, // ARCH:amd64
{name: "SubSaturatedInt8x32", argLength: 2}, // ARCH:amd64
{name: "SubSaturatedInt8x64", argLength: 2}, // ARCH:amd64
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index acb2d81..52a6b39 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -6228,20 +6228,6 @@
OpAddInt64x2
OpAddInt64x4
OpAddInt64x8
- OpAddPairsFloat32x4
- OpAddPairsFloat64x2
- OpAddPairsGroupedFloat32x8
- OpAddPairsGroupedFloat64x4
- OpAddPairsGroupedInt16x16
- OpAddPairsGroupedInt32x8
- OpAddPairsGroupedUint16x16
- OpAddPairsGroupedUint32x8
- OpAddPairsInt16x8
- OpAddPairsInt32x4
- OpAddPairsSaturatedGroupedInt16x16
- OpAddPairsSaturatedInt16x8
- OpAddPairsUint16x8
- OpAddPairsUint32x4
OpAddSaturatedInt8x16
OpAddSaturatedInt8x32
OpAddSaturatedInt8x64
@@ -6358,6 +6344,20 @@
OpCompressUint64x2
OpCompressUint64x4
OpCompressUint64x8
+ OpConcatAddPairsFloat32x4
+ OpConcatAddPairsFloat64x2
+ OpConcatAddPairsGroupedFloat32x8
+ OpConcatAddPairsGroupedFloat64x4
+ OpConcatAddPairsGroupedInt16x16
+ OpConcatAddPairsGroupedInt32x8
+ OpConcatAddPairsGroupedUint16x16
+ OpConcatAddPairsGroupedUint32x8
+ OpConcatAddPairsInt16x8
+ OpConcatAddPairsInt32x4
+ OpConcatAddPairsSaturatedGroupedInt16x16
+ OpConcatAddPairsSaturatedInt16x8
+ OpConcatAddPairsUint16x8
+ OpConcatAddPairsUint32x4
OpConcatPermuteFloat32x4
OpConcatPermuteFloat32x8
OpConcatPermuteFloat32x16
@@ -6388,6 +6388,20 @@
OpConcatPermuteUint64x2
OpConcatPermuteUint64x4
OpConcatPermuteUint64x8
+ OpConcatSubPairsFloat32x4
+ OpConcatSubPairsFloat64x2
+ OpConcatSubPairsGroupedFloat32x8
+ OpConcatSubPairsGroupedFloat64x4
+ OpConcatSubPairsGroupedInt16x16
+ OpConcatSubPairsGroupedInt32x8
+ OpConcatSubPairsGroupedUint16x16
+ OpConcatSubPairsGroupedUint32x8
+ OpConcatSubPairsInt16x8
+ OpConcatSubPairsInt32x4
+ OpConcatSubPairsSaturatedGroupedInt16x16
+ OpConcatSubPairsSaturatedInt16x8
+ OpConcatSubPairsUint16x8
+ OpConcatSubPairsUint32x4
OpConvertToFloat32Float64x2
OpConvertToFloat32Float64x4
OpConvertToFloat32Float64x8
@@ -7180,20 +7194,6 @@
OpSubInt64x2
OpSubInt64x4
OpSubInt64x8
- OpSubPairsFloat32x4
- OpSubPairsFloat64x2
- OpSubPairsGroupedFloat32x8
- OpSubPairsGroupedFloat64x4
- OpSubPairsGroupedInt16x16
- OpSubPairsGroupedInt32x8
- OpSubPairsGroupedUint16x16
- OpSubPairsGroupedUint32x8
- OpSubPairsInt16x8
- OpSubPairsInt32x4
- OpSubPairsSaturatedGroupedInt16x16
- OpSubPairsSaturatedInt16x8
- OpSubPairsUint16x8
- OpSubPairsUint32x4
OpSubSaturatedInt8x16
OpSubSaturatedInt8x32
OpSubSaturatedInt8x64
@@ -89498,76 +89498,6 @@
generic: true,
},
{
- name: "AddPairsFloat32x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsFloat64x2",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedFloat32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedFloat64x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedInt16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedInt32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedUint16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedUint32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsInt16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsInt32x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsSaturatedGroupedInt16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsSaturatedInt16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsUint16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsUint32x4",
- argLen: 2,
- generic: true,
- },
- {
name: "AddSaturatedInt8x16",
argLen: 2,
commutative: true,
@@ -90202,6 +90132,76 @@
generic: true,
},
{
+ name: "ConcatAddPairsFloat32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsFloat64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedFloat32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedFloat64x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedInt16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedInt32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedUint16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedUint32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsInt32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsSaturatedGroupedInt16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsSaturatedInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsUint16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsUint32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
name: "ConcatPermuteFloat32x4",
argLen: 3,
generic: true,
@@ -90352,6 +90352,76 @@
generic: true,
},
{
+ name: "ConcatSubPairsFloat32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsFloat64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedFloat32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedFloat64x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedInt16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedInt32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedUint16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedUint32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsInt32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsSaturatedGroupedInt16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsSaturatedInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsUint16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsUint32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
name: "ConvertToFloat32Float64x2",
argLen: 1,
generic: true,
@@ -94474,76 +94544,6 @@
generic: true,
},
{
- name: "SubPairsFloat32x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsFloat64x2",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedFloat32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedFloat64x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedInt16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedInt32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedUint16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedUint32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsInt16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsInt32x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsSaturatedGroupedInt16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsSaturatedInt16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsUint16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsUint32x4",
- argLen: 2,
- generic: true,
- },
- {
name: "SubSaturatedInt8x16",
argLen: 2,
generic: true,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index c2c09cc..179e492 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2192,48 +2192,6 @@
case OpAddInt8x64:
v.Op = OpAMD64VPADDB512
return true
- case OpAddPairsFloat32x4:
- v.Op = OpAMD64VHADDPS128
- return true
- case OpAddPairsFloat64x2:
- v.Op = OpAMD64VHADDPD128
- return true
- case OpAddPairsGroupedFloat32x8:
- v.Op = OpAMD64VHADDPS256
- return true
- case OpAddPairsGroupedFloat64x4:
- v.Op = OpAMD64VHADDPD256
- return true
- case OpAddPairsGroupedInt16x16:
- v.Op = OpAMD64VPHADDW256
- return true
- case OpAddPairsGroupedInt32x8:
- v.Op = OpAMD64VPHADDD256
- return true
- case OpAddPairsGroupedUint16x16:
- v.Op = OpAMD64VPHADDW256
- return true
- case OpAddPairsGroupedUint32x8:
- v.Op = OpAMD64VPHADDD256
- return true
- case OpAddPairsInt16x8:
- v.Op = OpAMD64VPHADDW128
- return true
- case OpAddPairsInt32x4:
- v.Op = OpAMD64VPHADDD128
- return true
- case OpAddPairsSaturatedGroupedInt16x16:
- v.Op = OpAMD64VPHADDSW256
- return true
- case OpAddPairsSaturatedInt16x8:
- v.Op = OpAMD64VPHADDSW128
- return true
- case OpAddPairsUint16x8:
- v.Op = OpAMD64VPHADDW128
- return true
- case OpAddPairsUint32x4:
- v.Op = OpAMD64VPHADDD128
- return true
case OpAddPtr:
v.Op = OpAMD64ADDQ
return true
@@ -2674,6 +2632,48 @@
return rewriteValueAMD64_OpCompressUint8x32(v)
case OpCompressUint8x64:
return rewriteValueAMD64_OpCompressUint8x64(v)
+ case OpConcatAddPairsFloat32x4:
+ v.Op = OpAMD64VHADDPS128
+ return true
+ case OpConcatAddPairsFloat64x2:
+ v.Op = OpAMD64VHADDPD128
+ return true
+ case OpConcatAddPairsGroupedFloat32x8:
+ v.Op = OpAMD64VHADDPS256
+ return true
+ case OpConcatAddPairsGroupedFloat64x4:
+ v.Op = OpAMD64VHADDPD256
+ return true
+ case OpConcatAddPairsGroupedInt16x16:
+ v.Op = OpAMD64VPHADDW256
+ return true
+ case OpConcatAddPairsGroupedInt32x8:
+ v.Op = OpAMD64VPHADDD256
+ return true
+ case OpConcatAddPairsGroupedUint16x16:
+ v.Op = OpAMD64VPHADDW256
+ return true
+ case OpConcatAddPairsGroupedUint32x8:
+ v.Op = OpAMD64VPHADDD256
+ return true
+ case OpConcatAddPairsInt16x8:
+ v.Op = OpAMD64VPHADDW128
+ return true
+ case OpConcatAddPairsInt32x4:
+ v.Op = OpAMD64VPHADDD128
+ return true
+ case OpConcatAddPairsSaturatedGroupedInt16x16:
+ v.Op = OpAMD64VPHADDSW256
+ return true
+ case OpConcatAddPairsSaturatedInt16x8:
+ v.Op = OpAMD64VPHADDSW128
+ return true
+ case OpConcatAddPairsUint16x8:
+ v.Op = OpAMD64VPHADDW128
+ return true
+ case OpConcatAddPairsUint32x4:
+ v.Op = OpAMD64VPHADDD128
+ return true
case OpConcatPermuteFloat32x16:
v.Op = OpAMD64VPERMI2PS512
return true
@@ -2773,6 +2773,48 @@
case OpConcatShiftBytesRightUint8x16:
v.Op = OpAMD64VPALIGNR128
return true
+ case OpConcatSubPairsFloat32x4:
+ v.Op = OpAMD64VHSUBPS128
+ return true
+ case OpConcatSubPairsFloat64x2:
+ v.Op = OpAMD64VHSUBPD128
+ return true
+ case OpConcatSubPairsGroupedFloat32x8:
+ v.Op = OpAMD64VHSUBPS256
+ return true
+ case OpConcatSubPairsGroupedFloat64x4:
+ v.Op = OpAMD64VHSUBPD256
+ return true
+ case OpConcatSubPairsGroupedInt16x16:
+ v.Op = OpAMD64VPHSUBW256
+ return true
+ case OpConcatSubPairsGroupedInt32x8:
+ v.Op = OpAMD64VPHSUBD256
+ return true
+ case OpConcatSubPairsGroupedUint16x16:
+ v.Op = OpAMD64VPHSUBW256
+ return true
+ case OpConcatSubPairsGroupedUint32x8:
+ v.Op = OpAMD64VPHSUBD256
+ return true
+ case OpConcatSubPairsInt16x8:
+ v.Op = OpAMD64VPHSUBW128
+ return true
+ case OpConcatSubPairsInt32x4:
+ v.Op = OpAMD64VPHSUBD128
+ return true
+ case OpConcatSubPairsSaturatedGroupedInt16x16:
+ v.Op = OpAMD64VPHSUBSW256
+ return true
+ case OpConcatSubPairsSaturatedInt16x8:
+ v.Op = OpAMD64VPHSUBSW128
+ return true
+ case OpConcatSubPairsUint16x8:
+ v.Op = OpAMD64VPHSUBW128
+ return true
+ case OpConcatSubPairsUint32x4:
+ v.Op = OpAMD64VPHSUBD128
+ return true
case OpCondSelect:
return rewriteValueAMD64_OpCondSelect(v)
case OpConst16:
@@ -5855,48 +5897,6 @@
case OpSubInt8x64:
v.Op = OpAMD64VPSUBB512
return true
- case OpSubPairsFloat32x4:
- v.Op = OpAMD64VHSUBPS128
- return true
- case OpSubPairsFloat64x2:
- v.Op = OpAMD64VHSUBPD128
- return true
- case OpSubPairsGroupedFloat32x8:
- v.Op = OpAMD64VHSUBPS256
- return true
- case OpSubPairsGroupedFloat64x4:
- v.Op = OpAMD64VHSUBPD256
- return true
- case OpSubPairsGroupedInt16x16:
- v.Op = OpAMD64VPHSUBW256
- return true
- case OpSubPairsGroupedInt32x8:
- v.Op = OpAMD64VPHSUBD256
- return true
- case OpSubPairsGroupedUint16x16:
- v.Op = OpAMD64VPHSUBW256
- return true
- case OpSubPairsGroupedUint32x8:
- v.Op = OpAMD64VPHSUBD256
- return true
- case OpSubPairsInt16x8:
- v.Op = OpAMD64VPHSUBW128
- return true
- case OpSubPairsInt32x4:
- v.Op = OpAMD64VPHSUBD128
- return true
- case OpSubPairsSaturatedGroupedInt16x16:
- v.Op = OpAMD64VPHSUBSW256
- return true
- case OpSubPairsSaturatedInt16x8:
- v.Op = OpAMD64VPHSUBSW128
- return true
- case OpSubPairsUint16x8:
- v.Op = OpAMD64VPHSUBW128
- return true
- case OpSubPairsUint32x4:
- v.Op = OpAMD64VPHSUBD128
- return true
case OpSubPtr:
v.Op = OpAMD64SUBQ
return true
diff --git a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
index e790254..5c94e17 100644
--- a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
@@ -66,20 +66,6 @@
addF(simdPackage, "Uint64x2.Add", opLen2(ssa.OpAddUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float64x4.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.AddPairsSaturatedGrouped", opLen2(ssa.OpAddPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64)
@@ -196,6 +182,20 @@
addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float32x4.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsFloat32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x2.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsFloat64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x8.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsInt16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x4.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x8.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsUint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x4.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsUint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float32x8.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x4.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x16.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x8.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x8.ConcatAddPairsSaturated", opLen2(ssa.OpConcatAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x16.ConcatAddPairsSaturatedGrouped", opLen2(ssa.OpConcatAddPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x32, types.TypeVec256), sys.AMD64)
@@ -229,6 +229,20 @@
addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Float32x4.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsFloat32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x2.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsFloat64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x8.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsInt16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x4.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x8.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsUint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x4.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsUint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float32x8.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x4.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x16.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x8.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x8.ConcatSubPairsSaturated", opLen2(ssa.OpConcatSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x16.ConcatSubPairsSaturatedGrouped", opLen2(ssa.OpConcatSubPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.ConvertToFloat32", opLen1(ssa.OpConvertToFloat32Float64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.ConvertToFloat32", opLen1(ssa.OpConvertToFloat32Float64x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x8.ConvertToFloat32", opLen1(ssa.OpConvertToFloat32Float64x8, types.TypeVec256), sys.AMD64)
@@ -1154,20 +1168,6 @@
addF(simdPackage, "Uint64x2.Sub", opLen2(ssa.OpSubUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Sub", opLen2(ssa.OpSubUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Sub", opLen2(ssa.OpSubUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float64x4.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.SubPairsSaturatedGrouped", opLen2(ssa.OpSubPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml
index ac5bd82..32c26d0 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml
@@ -15,45 +15,45 @@
commutative: false
documentation: !string |-
// NAME subtracts corresponding elements of two vectors with saturation.
-- go: AddPairs
+- go: ConcatAddPairs
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-- go: AddPairs
+- go: ConcatAddPairs
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
-- go: SubPairs
+- go: ConcatSubPairs
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-- go: SubPairs
+- go: ConcatSubPairs
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
-- go: AddPairsSaturated
+- go: ConcatAddPairsSaturated
commutative: false
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-- go: SubPairsSaturated
+- go: ConcatSubPairsSaturated
commutative: false
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-- go: AddPairsGrouped
+- go: ConcatAddPairsGrouped
commutative: false
out:
- elemBits: 16|32
@@ -61,7 +61,7 @@
// NAME horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-- go: AddPairsGrouped
+- go: ConcatAddPairsGrouped
commutative: false
out:
- elemBits: 64
@@ -69,7 +69,7 @@
// NAME horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
-- go: SubPairsGrouped
+- go: ConcatSubPairsGrouped
commutative: false
out:
- elemBits: 16|32
@@ -77,7 +77,7 @@
// NAME horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-- go: SubPairsGrouped
+- go: ConcatSubPairsGrouped
commutative: false
out:
- elemBits: 64
@@ -85,13 +85,13 @@
// NAME horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
-- go: AddPairsSaturatedGrouped
+- go: ConcatAddPairsSaturatedGrouped
commutative: false
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements with saturation.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-- go: SubPairsSaturatedGrouped
+- go: ConcatSubPairsSaturatedGrouped
commutative: false
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements with saturation.
diff --git a/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
index 17cee59..e834562 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
@@ -51,7 +51,7 @@
- *uint
out:
- *uint
-- go: AddPairs
+- go: ConcatAddPairs
asm: "VPHADD[DW]"
in: &2any128
- &any128
@@ -60,19 +60,19 @@
- *any128
out: &1any128
- *any128
-- go: SubPairs
+- go: ConcatSubPairs
asm: "VPHSUB[DW]"
in: *2any128
out: *1any128
-- go: AddPairs
+- go: ConcatAddPairs
asm: "VHADDP[SD]" # floats
in: *2any128
out: *1any128
-- go: SubPairs
+- go: ConcatSubPairs
asm: "VHSUBP[SD]" # floats
in: *2any128
out: *1any128
-- go: AddPairsSaturated
+- go: ConcatAddPairsSaturated
asm: "VPHADDS[DW]"
in: &2int128
- &int128
@@ -82,11 +82,11 @@
- *int128
out: &1int128
- *int128
-- go: SubPairsSaturated
+- go: ConcatSubPairsSaturated
asm: "VPHSUBS[DW]"
in: *2int128
out: *1int128
-- go: AddPairsGrouped
+- go: ConcatAddPairsGrouped
asm: "VPHADD[DW]"
in: &2any256
- &any256
@@ -95,19 +95,19 @@
- *any256
out: &1any256
- *any256
-- go: SubPairsGrouped
+- go: ConcatSubPairsGrouped
asm: "VPHSUB[DW]"
in: *2any256
out: *1any256
-- go: AddPairsGrouped
+- go: ConcatAddPairsGrouped
asm: "VHADDP[SD]" # floats
in: *2any256
out: *1any256
-- go: SubPairsGrouped
+- go: ConcatSubPairsGrouped
asm: "VHSUBP[SD]" # floats
in: *2any256
out: *1any256
-- go: AddPairsSaturatedGrouped
+- go: ConcatAddPairsSaturatedGrouped
asm: "VPHADDS[DW]"
in: &2int256
- &int256
@@ -117,7 +117,7 @@
- *int256
out: &1int256
- *int256
-- go: SubPairsSaturatedGrouped
+- go: ConcatSubPairsSaturatedGrouped
asm: "VPHSUBS[DW]"
in: *2int256
out: *1int256
diff --git a/src/simd/archsimd/internal/simd_test/simd_test.go b/src/simd/archsimd/internal/simd_test/simd_test.go
index 71366f6..224d0bf 100644
--- a/src/simd/archsimd/internal/simd_test/simd_test.go
+++ b/src/simd/archsimd/internal/simd_test/simd_test.go
@@ -1329,33 +1329,33 @@
}

func TestAddSubPairs(t *testing.T) {
- testInt16x8Binary(t, archsimd.Int16x8.AddPairs, addPairsSlice[int16])
- testInt16x8Binary(t, archsimd.Int16x8.SubPairs, subPairsSlice[int16])
- testUint16x8Binary(t, archsimd.Uint16x8.AddPairs, addPairsSlice[uint16])
- testUint16x8Binary(t, archsimd.Uint16x8.SubPairs, subPairsSlice[uint16])
- testInt32x4Binary(t, archsimd.Int32x4.AddPairs, addPairsSlice[int32])
- testInt32x4Binary(t, archsimd.Int32x4.SubPairs, subPairsSlice[int32])
- testUint32x4Binary(t, archsimd.Uint32x4.AddPairs, addPairsSlice[uint32])
- testUint32x4Binary(t, archsimd.Uint32x4.SubPairs, subPairsSlice[uint32])
- testFloat32x4Binary(t, archsimd.Float32x4.AddPairs, addPairsSlice[float32])
- testFloat32x4Binary(t, archsimd.Float32x4.SubPairs, subPairsSlice[float32])
- testFloat64x2Binary(t, archsimd.Float64x2.AddPairs, addPairsSlice[float64])
- testFloat64x2Binary(t, archsimd.Float64x2.SubPairs, subPairsSlice[float64])
+ testInt16x8Binary(t, archsimd.Int16x8.ConcatAddPairs, addPairsSlice[int16])
+ testInt16x8Binary(t, archsimd.Int16x8.ConcatSubPairs, subPairsSlice[int16])
+ testUint16x8Binary(t, archsimd.Uint16x8.ConcatAddPairs, addPairsSlice[uint16])
+ testUint16x8Binary(t, archsimd.Uint16x8.ConcatSubPairs, subPairsSlice[uint16])
+ testInt32x4Binary(t, archsimd.Int32x4.ConcatAddPairs, addPairsSlice[int32])
+ testInt32x4Binary(t, archsimd.Int32x4.ConcatSubPairs, subPairsSlice[int32])
+ testUint32x4Binary(t, archsimd.Uint32x4.ConcatAddPairs, addPairsSlice[uint32])
+ testUint32x4Binary(t, archsimd.Uint32x4.ConcatSubPairs, subPairsSlice[uint32])
+ testFloat32x4Binary(t, archsimd.Float32x4.ConcatAddPairs, addPairsSlice[float32])
+ testFloat32x4Binary(t, archsimd.Float32x4.ConcatSubPairs, subPairsSlice[float32])
+ testFloat64x2Binary(t, archsimd.Float64x2.ConcatAddPairs, addPairsSlice[float64])
+ testFloat64x2Binary(t, archsimd.Float64x2.ConcatSubPairs, subPairsSlice[float64])

// Grouped versions
if archsimd.X86.AVX2() {
- testInt16x16Binary(t, archsimd.Int16x16.AddPairsGrouped, addPairsGroupedSlice[int16])
- testInt16x16Binary(t, archsimd.Int16x16.SubPairsGrouped, subPairsGroupedSlice[int16])
- testUint16x16Binary(t, archsimd.Uint16x16.AddPairsGrouped, addPairsGroupedSlice[uint16])
- testUint16x16Binary(t, archsimd.Uint16x16.SubPairsGrouped, subPairsGroupedSlice[uint16])
- testInt32x8Binary(t, archsimd.Int32x8.AddPairsGrouped, addPairsGroupedSlice[int32])
- testInt32x8Binary(t, archsimd.Int32x8.SubPairsGrouped, subPairsGroupedSlice[int32])
- testUint32x8Binary(t, archsimd.Uint32x8.AddPairsGrouped, addPairsGroupedSlice[uint32])
- testUint32x8Binary(t, archsimd.Uint32x8.SubPairsGrouped, subPairsGroupedSlice[uint32])
- testFloat32x8Binary(t, archsimd.Float32x8.AddPairsGrouped, addPairsGroupedSlice[float32])
- testFloat32x8Binary(t, archsimd.Float32x8.SubPairsGrouped, subPairsGroupedSlice[float32])
- testFloat64x4Binary(t, archsimd.Float64x4.AddPairsGrouped, addPairsGroupedSlice[float64])
- testFloat64x4Binary(t, archsimd.Float64x4.SubPairsGrouped, subPairsGroupedSlice[float64])
+ testInt16x16Binary(t, archsimd.Int16x16.ConcatAddPairsGrouped, addPairsGroupedSlice[int16])
+ testInt16x16Binary(t, archsimd.Int16x16.ConcatSubPairsGrouped, subPairsGroupedSlice[int16])
+ testUint16x16Binary(t, archsimd.Uint16x16.ConcatAddPairsGrouped, addPairsGroupedSlice[uint16])
+ testUint16x16Binary(t, archsimd.Uint16x16.ConcatSubPairsGrouped, subPairsGroupedSlice[uint16])
+ testInt32x8Binary(t, archsimd.Int32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[int32])
+ testInt32x8Binary(t, archsimd.Int32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[int32])
+ testUint32x8Binary(t, archsimd.Uint32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[uint32])
+ testUint32x8Binary(t, archsimd.Uint32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[uint32])
+ testFloat32x8Binary(t, archsimd.Float32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[float32])
+ testFloat32x8Binary(t, archsimd.Float32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[float32])
+ testFloat64x4Binary(t, archsimd.Float64x4.ConcatAddPairsGrouped, addPairsGroupedSlice[float64])
+ testFloat64x4Binary(t, archsimd.Float64x4.ConcatSubPairsGrouped, subPairsGroupedSlice[float64])
}
}

diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index b105206..5cfe5fb 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -346,105 +346,6 @@
// Asm: VPADDQ, CPU Feature: AVX512
func (x Uint64x8) Add(y Uint64x8) Uint64x8

-/* AddPairs */
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VHADDPS, CPU Feature: AVX
-func (x Float32x4) AddPairs(y Float32x4) Float32x4
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
-//
-// Asm: VHADDPD, CPU Feature: AVX
-func (x Float64x2) AddPairs(y Float64x2) Float64x2
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX
-func (x Int16x8) AddPairs(y Int16x8) Int16x8
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX
-func (x Int32x4) AddPairs(y Int32x4) Int32x4
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX
-func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX
-func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4
-
-/* AddPairsGrouped */
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VHADDPS, CPU Feature: AVX
-func (x Float32x8) AddPairsGrouped(y Float32x8) Float32x8
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
-//
-// Asm: VHADDPD, CPU Feature: AVX
-func (x Float64x4) AddPairsGrouped(y Float64x4) Float64x4
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX2
-func (x Int16x16) AddPairsGrouped(y Int16x16) Int16x16
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX2
-func (x Int32x8) AddPairsGrouped(y Int32x8) Int32x8
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX2
-func (x Uint16x16) AddPairsGrouped(y Uint16x16) Uint16x16
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX2
-func (x Uint32x8) AddPairsGrouped(y Uint32x8) Uint32x8
-
-/* AddPairsSaturated */
-
-// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDSW, CPU Feature: AVX
-func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8
-
-/* AddPairsSaturatedGrouped */
-
-// AddPairsSaturatedGrouped horizontally adds adjacent pairs of elements with saturation.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDSW, CPU Feature: AVX2
-func (x Int16x16) AddPairsSaturatedGrouped(y Int16x16) Int16x16
-
/* AddSaturated */

// AddSaturated adds corresponding elements of two vectors with saturation.
@@ -1097,6 +998,105 @@
// Asm: VPCOMPRESSQ, CPU Feature: AVX512
func (x Uint64x8) Compress(mask Mask64x8) Uint64x8

+/* ConcatAddPairs */
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VHADDPS, CPU Feature: AVX
+func (x Float32x4) ConcatAddPairs(y Float32x4) Float32x4
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
+//
+// Asm: VHADDPD, CPU Feature: AVX
+func (x Float64x2) ConcatAddPairs(y Float64x2) Float64x2
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX
+func (x Int16x8) ConcatAddPairs(y Int16x8) Int16x8
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX
+func (x Int32x4) ConcatAddPairs(y Int32x4) Int32x4
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX
+func (x Uint16x8) ConcatAddPairs(y Uint16x8) Uint16x8
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX
+func (x Uint32x4) ConcatAddPairs(y Uint32x4) Uint32x4
+
+/* ConcatAddPairsGrouped */
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VHADDPS, CPU Feature: AVX
+func (x Float32x8) ConcatAddPairsGrouped(y Float32x8) Float32x8
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
+//
+// Asm: VHADDPD, CPU Feature: AVX
+func (x Float64x4) ConcatAddPairsGrouped(y Float64x4) Float64x4
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX2
+func (x Int16x16) ConcatAddPairsGrouped(y Int16x16) Int16x16
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX2
+func (x Int32x8) ConcatAddPairsGrouped(y Int32x8) Int32x8
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX2
+func (x Uint16x16) ConcatAddPairsGrouped(y Uint16x16) Uint16x16
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX2
+func (x Uint32x8) ConcatAddPairsGrouped(y Uint32x8) Uint32x8
+
+/* ConcatAddPairsSaturated */
+
+// ConcatAddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDSW, CPU Feature: AVX
+func (x Int16x8) ConcatAddPairsSaturated(y Int16x8) Int16x8
+
+/* ConcatAddPairsSaturatedGrouped */
+
+// ConcatAddPairsSaturatedGrouped horizontally adds adjacent pairs of elements with saturation.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDSW, CPU Feature: AVX2
+func (x Int16x16) ConcatAddPairsSaturatedGrouped(y Int16x16) Int16x16
+
/* ConcatPermute */

// ConcatPermute performs a full permutation of vector x, y using indices:
@@ -1429,6 +1429,105 @@
// Asm: VPALIGNR, CPU Feature: AVX512
func (x Uint8x64) ConcatShiftBytesRightGrouped(y Uint8x64, shift uint8) Uint8x64

+/* ConcatSubPairs */
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x4) ConcatSubPairs(y Float32x4) Float32x4
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x2) ConcatSubPairs(y Float64x2) Float64x2
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Int16x8) ConcatSubPairs(y Int16x8) Int16x8
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX
+func (x Int32x4) ConcatSubPairs(y Int32x4) Int32x4
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Uint16x8) ConcatSubPairs(y Uint16x8) Uint16x8
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX
+func (x Uint32x4) ConcatSubPairs(y Uint32x4) Uint32x4
+
+/* ConcatSubPairsGrouped */
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x8) ConcatSubPairsGrouped(y Float32x8) Float32x8
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x4) ConcatSubPairsGrouped(y Float64x4) Float64x4
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX2
+func (x Int16x16) ConcatSubPairsGrouped(y Int16x16) Int16x16
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX2
+func (x Int32x8) ConcatSubPairsGrouped(y Int32x8) Int32x8
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX2
+func (x Uint16x16) ConcatSubPairsGrouped(y Uint16x16) Uint16x16
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX2
+func (x Uint32x8) ConcatSubPairsGrouped(y Uint32x8) Uint32x8
+
+/* ConcatSubPairsSaturated */
+
+// ConcatSubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBSW, CPU Feature: AVX
+func (x Int16x8) ConcatSubPairsSaturated(y Int16x8) Int16x8
+
+/* ConcatSubPairsSaturatedGrouped */
+
+// ConcatSubPairsSaturatedGrouped horizontally subtracts adjacent pairs of elements with saturation.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBSW, CPU Feature: AVX2
+func (x Int16x16) ConcatSubPairsSaturatedGrouped(y Int16x16) Int16x16
+
/* ConvertToFloat32 */

// ConvertToFloat32 converts element values to float32.
@@ -7170,105 +7269,6 @@
// Asm: VPSUBQ, CPU Feature: AVX512
func (x Uint64x8) Sub(y Uint64x8) Uint64x8

-/* SubPairs */
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VHSUBPS, CPU Feature: AVX
-func (x Float32x4) SubPairs(y Float32x4) Float32x4
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
-//
-// Asm: VHSUBPD, CPU Feature: AVX
-func (x Float64x2) SubPairs(y Float64x2) Float64x2
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX
-func (x Int16x8) SubPairs(y Int16x8) Int16x8
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX
-func (x Int32x4) SubPairs(y Int32x4) Int32x4
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX
-func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX
-func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4
-
-/* SubPairsGrouped */
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VHSUBPS, CPU Feature: AVX
-func (x Float32x8) SubPairsGrouped(y Float32x8) Float32x8
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
-//
-// Asm: VHSUBPD, CPU Feature: AVX
-func (x Float64x4) SubPairsGrouped(y Float64x4) Float64x4
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX2
-func (x Int16x16) SubPairsGrouped(y Int16x16) Int16x16
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX2
-func (x Int32x8) SubPairsGrouped(y Int32x8) Int32x8
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX2
-func (x Uint16x16) SubPairsGrouped(y Uint16x16) Uint16x16
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX2
-func (x Uint32x8) SubPairsGrouped(y Uint32x8) Uint32x8
-
-/* SubPairsSaturated */
-
-// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBSW, CPU Feature: AVX
-func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8
-
-/* SubPairsSaturatedGrouped */
-
-// SubPairsSaturatedGrouped horizontally subtracts adjacent pairs of elements with saturation.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBSW, CPU Feature: AVX2
-func (x Int16x16) SubPairsSaturatedGrouped(y Int16x16) Int16x16
-
/* SubSaturated */

// SubSaturated subtracts corresponding elements of two vectors with saturation.

Change information

Files:
  • M src/cmd/compile/internal/amd64/simdssa.go
  • M src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
  • M src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
  • M src/cmd/compile/internal/ssa/opGen.go
  • M src/cmd/compile/internal/ssa/rewriteAMD64.go
  • M src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
  • M src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml
  • M src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
  • M src/simd/archsimd/internal/simd_test/simd_test.go
  • M src/simd/archsimd/ops_amd64.go
Change size: XL
Delta: 10 files changed, 602 insertions(+), 602 deletions(-)
Open in Gerrit

Related details

Attention set is empty
Submit Requirements:
  • requirement is not satisfied: Code-Review
  • requirement satisfied: No-Unresolved-Comments
  • requirement is not satisfied: Review-Enforcement
  • requirement is not satisfied: TryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newchange
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: I7a3f6fff272bc7630c116dc265d1677c0187d12a
Gerrit-Change-Number: 777000
Gerrit-PatchSet: 1
Gerrit-Owner: Junyang Shao <shaoj...@google.com>
unsatisfied_requirement
satisfied_requirement
open
diffy

Junyang Shao (Gerrit)

unread,
5:11 PM (6 hours ago) 5:11 PM
to goph...@pubsubhelper.golang.org, David Chase, golang-co...@googlegroups.com
Attention needed from David Chase

Junyang Shao voted Commit-Queue+1

Commit-Queue+1
Open in Gerrit

Related details

Attention is currently required from:
  • David Chase
Submit Requirements:
  • requirement is not satisfied: Code-Review
  • requirement satisfied: No-Unresolved-Comments
  • requirement is not satisfied: Review-Enforcement
  • requirement is not satisfied: TryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: comment
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: I7a3f6fff272bc7630c116dc265d1677c0187d12a
Gerrit-Change-Number: 777000
Gerrit-PatchSet: 2
Gerrit-Owner: Junyang Shao <shaoj...@google.com>
Gerrit-Reviewer: David Chase <drc...@google.com>
Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
Gerrit-Attention: David Chase <drc...@google.com>
Gerrit-Comment-Date: Mon, 11 May 2026 21:11:14 +0000
Gerrit-HasComments: No
Gerrit-Has-Labels: Yes
unsatisfied_requirement
satisfied_requirement
open
diffy
Reply all
Reply to author
Forward
0 new messages