[go/dev.simd] [dev.simd] simd, cmd/compile: rename (Add|Sub)Pairs* to Concat(Add|Sub)Pairs*

1 view
Skip to first unread message

Junyang Shao (Gerrit)

unread,
5:00 PM (6 hours ago) 5:00 PM
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Junyang Shao has uploaded the change for review

Commit message

[dev.simd] simd, cmd/compile: rename (Add|Sub)Pairs* to Concat(Add|Sub)Pairs*

For #78979.
Change-Id: I7a3f6fff272bc7630c116dc265d1677c0187d12a

Change diff

diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 618a7ad..402e711 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -249,16 +249,6 @@
ssa.OpAMD64VPADDQ128,
ssa.OpAMD64VPADDQ256,
ssa.OpAMD64VPADDQ512,
- ssa.OpAMD64VHADDPS128,
- ssa.OpAMD64VHADDPD128,
- ssa.OpAMD64VPHADDW128,
- ssa.OpAMD64VPHADDD128,
- ssa.OpAMD64VHADDPS256,
- ssa.OpAMD64VHADDPD256,
- ssa.OpAMD64VPHADDW256,
- ssa.OpAMD64VPHADDD256,
- ssa.OpAMD64VPHADDSW128,
- ssa.OpAMD64VPHADDSW256,
ssa.OpAMD64VPADDSB128,
ssa.OpAMD64VPADDSB256,
ssa.OpAMD64VPADDSB512,
@@ -289,6 +279,26 @@
ssa.OpAMD64VPAVGW128,
ssa.OpAMD64VPAVGW256,
ssa.OpAMD64VPAVGW512,
+ ssa.OpAMD64VHADDPS128,
+ ssa.OpAMD64VHADDPD128,
+ ssa.OpAMD64VPHADDW128,
+ ssa.OpAMD64VPHADDD128,
+ ssa.OpAMD64VHADDPS256,
+ ssa.OpAMD64VHADDPD256,
+ ssa.OpAMD64VPHADDW256,
+ ssa.OpAMD64VPHADDD256,
+ ssa.OpAMD64VPHADDSW128,
+ ssa.OpAMD64VPHADDSW256,
+ ssa.OpAMD64VHSUBPS128,
+ ssa.OpAMD64VHSUBPD128,
+ ssa.OpAMD64VPHSUBW128,
+ ssa.OpAMD64VPHSUBD128,
+ ssa.OpAMD64VHSUBPS256,
+ ssa.OpAMD64VHSUBPD256,
+ ssa.OpAMD64VPHSUBW256,
+ ssa.OpAMD64VPHSUBD256,
+ ssa.OpAMD64VPHSUBSW128,
+ ssa.OpAMD64VPHSUBSW256,
ssa.OpAMD64VDIVPS128,
ssa.OpAMD64VDIVPS256,
ssa.OpAMD64VDIVPS512,
@@ -519,16 +529,6 @@
ssa.OpAMD64VPSUBQ128,
ssa.OpAMD64VPSUBQ256,
ssa.OpAMD64VPSUBQ512,
- ssa.OpAMD64VHSUBPS128,
- ssa.OpAMD64VHSUBPD128,
- ssa.OpAMD64VPHSUBW128,
- ssa.OpAMD64VPHSUBD128,
- ssa.OpAMD64VHSUBPS256,
- ssa.OpAMD64VHSUBPD256,
- ssa.OpAMD64VPHSUBW256,
- ssa.OpAMD64VPHSUBD256,
- ssa.OpAMD64VPHSUBSW128,
- ssa.OpAMD64VPHSUBSW256,
ssa.OpAMD64VPSUBSB128,
ssa.OpAMD64VPSUBSB256,
ssa.OpAMD64VPSUBSB512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 8fdf860..cdbedbc 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -56,20 +56,6 @@
(AddUint64x2 ...) => (VPADDQ128 ...)
(AddUint64x4 ...) => (VPADDQ256 ...)
(AddUint64x8 ...) => (VPADDQ512 ...)
-(AddPairsFloat32x4 ...) => (VHADDPS128 ...)
-(AddPairsFloat64x2 ...) => (VHADDPD128 ...)
-(AddPairsInt16x8 ...) => (VPHADDW128 ...)
-(AddPairsInt32x4 ...) => (VPHADDD128 ...)
-(AddPairsUint16x8 ...) => (VPHADDW128 ...)
-(AddPairsUint32x4 ...) => (VPHADDD128 ...)
-(AddPairsGroupedFloat32x8 ...) => (VHADDPS256 ...)
-(AddPairsGroupedFloat64x4 ...) => (VHADDPD256 ...)
-(AddPairsGroupedInt16x16 ...) => (VPHADDW256 ...)
-(AddPairsGroupedInt32x8 ...) => (VPHADDD256 ...)
-(AddPairsGroupedUint16x16 ...) => (VPHADDW256 ...)
-(AddPairsGroupedUint32x8 ...) => (VPHADDD256 ...)
-(AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
-(AddPairsSaturatedGroupedInt16x16 ...) => (VPHADDSW256 ...)
(AddSaturatedInt8x16 ...) => (VPADDSB128 ...)
(AddSaturatedInt8x32 ...) => (VPADDSB256 ...)
(AddSaturatedInt8x64 ...) => (VPADDSB512 ...)
@@ -186,6 +172,20 @@
(CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
(CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
(CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(ConcatAddPairsFloat32x4 ...) => (VHADDPS128 ...)
+(ConcatAddPairsFloat64x2 ...) => (VHADDPD128 ...)
+(ConcatAddPairsInt16x8 ...) => (VPHADDW128 ...)
+(ConcatAddPairsInt32x4 ...) => (VPHADDD128 ...)
+(ConcatAddPairsUint16x8 ...) => (VPHADDW128 ...)
+(ConcatAddPairsUint32x4 ...) => (VPHADDD128 ...)
+(ConcatAddPairsGroupedFloat32x8 ...) => (VHADDPS256 ...)
+(ConcatAddPairsGroupedFloat64x4 ...) => (VHADDPD256 ...)
+(ConcatAddPairsGroupedInt16x16 ...) => (VPHADDW256 ...)
+(ConcatAddPairsGroupedInt32x8 ...) => (VPHADDD256 ...)
+(ConcatAddPairsGroupedUint16x16 ...) => (VPHADDW256 ...)
+(ConcatAddPairsGroupedUint32x8 ...) => (VPHADDD256 ...)
+(ConcatAddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
+(ConcatAddPairsSaturatedGroupedInt16x16 ...) => (VPHADDSW256 ...)
(ConcatPermuteFloat32x4 ...) => (VPERMI2PS128 ...)
(ConcatPermuteFloat32x8 ...) => (VPERMI2PS256 ...)
(ConcatPermuteFloat32x16 ...) => (VPERMI2PS512 ...)
@@ -219,6 +219,20 @@
(ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
(ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
(ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
+(ConcatSubPairsFloat32x4 ...) => (VHSUBPS128 ...)
+(ConcatSubPairsFloat64x2 ...) => (VHSUBPD128 ...)
+(ConcatSubPairsInt16x8 ...) => (VPHSUBW128 ...)
+(ConcatSubPairsInt32x4 ...) => (VPHSUBD128 ...)
+(ConcatSubPairsUint16x8 ...) => (VPHSUBW128 ...)
+(ConcatSubPairsUint32x4 ...) => (VPHSUBD128 ...)
+(ConcatSubPairsGroupedFloat32x8 ...) => (VHSUBPS256 ...)
+(ConcatSubPairsGroupedFloat64x4 ...) => (VHSUBPD256 ...)
+(ConcatSubPairsGroupedInt16x16 ...) => (VPHSUBW256 ...)
+(ConcatSubPairsGroupedInt32x8 ...) => (VPHSUBD256 ...)
+(ConcatSubPairsGroupedUint16x16 ...) => (VPHSUBW256 ...)
+(ConcatSubPairsGroupedUint32x8 ...) => (VPHSUBD256 ...)
+(ConcatSubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
+(ConcatSubPairsSaturatedGroupedInt16x16 ...) => (VPHSUBSW256 ...)
(ConvertToFloat32Float64x2 ...) => (VCVTPD2PSX128 ...)
(ConvertToFloat32Float64x4 ...) => (VCVTPD2PSY128 ...)
(ConvertToFloat32Float64x8 ...) => (VCVTPD2PS256 ...)
@@ -1198,20 +1212,6 @@
(SubUint64x2 ...) => (VPSUBQ128 ...)
(SubUint64x4 ...) => (VPSUBQ256 ...)
(SubUint64x8 ...) => (VPSUBQ512 ...)
-(SubPairsFloat32x4 ...) => (VHSUBPS128 ...)
-(SubPairsFloat64x2 ...) => (VHSUBPD128 ...)
-(SubPairsInt16x8 ...) => (VPHSUBW128 ...)
-(SubPairsInt32x4 ...) => (VPHSUBD128 ...)
-(SubPairsUint16x8 ...) => (VPHSUBW128 ...)
-(SubPairsUint32x4 ...) => (VPHSUBD128 ...)
-(SubPairsGroupedFloat32x8 ...) => (VHSUBPS256 ...)
-(SubPairsGroupedFloat64x4 ...) => (VHSUBPD256 ...)
-(SubPairsGroupedInt16x16 ...) => (VPHSUBW256 ...)
-(SubPairsGroupedInt32x8 ...) => (VPHSUBD256 ...)
-(SubPairsGroupedUint16x16 ...) => (VPHSUBW256 ...)
-(SubPairsGroupedUint32x8 ...) => (VPHSUBD256 ...)
-(SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
-(SubPairsSaturatedGroupedInt16x16 ...) => (VPHSUBSW256 ...)
(SubSaturatedInt8x16 ...) => (VPSUBSB128 ...)
(SubSaturatedInt8x32 ...) => (VPSUBSB256 ...)
(SubSaturatedInt8x64 ...) => (VPSUBSB512 ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index f1d87ab..3644bbc 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -47,20 +47,6 @@
{name: "AddInt64x2", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AddInt64x4", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AddInt64x8", argLength: 2, commutative: true}, // ARCH:amd64
- {name: "AddPairsFloat32x4", argLength: 2}, // ARCH:amd64
- {name: "AddPairsFloat64x2", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedFloat32x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedFloat64x4", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedInt16x16", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedInt32x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedUint16x16", argLength: 2}, // ARCH:amd64
- {name: "AddPairsGroupedUint32x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsInt16x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsInt32x4", argLength: 2}, // ARCH:amd64
- {name: "AddPairsSaturatedGroupedInt16x16", argLength: 2}, // ARCH:amd64
- {name: "AddPairsSaturatedInt16x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsUint16x8", argLength: 2}, // ARCH:amd64
- {name: "AddPairsUint32x4", argLength: 2}, // ARCH:amd64
{name: "AddSaturatedInt8x16", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AddSaturatedInt8x32", argLength: 2, commutative: true}, // ARCH:amd64
{name: "AddSaturatedInt8x64", argLength: 2, commutative: true}, // ARCH:amd64
@@ -177,6 +163,20 @@
{name: "CompressUint64x2", argLength: 2}, // ARCH:amd64
{name: "CompressUint64x4", argLength: 2}, // ARCH:amd64
{name: "CompressUint64x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsFloat32x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsFloat64x2", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedFloat32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedFloat64x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedInt16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedInt32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedUint16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsGroupedUint32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsInt32x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsSaturatedGroupedInt16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsSaturatedInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsUint16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatAddPairsUint32x4", argLength: 2}, // ARCH:amd64
{name: "ConcatPermuteFloat32x4", argLength: 3}, // ARCH:amd64
{name: "ConcatPermuteFloat32x8", argLength: 3}, // ARCH:amd64
{name: "ConcatPermuteFloat32x16", argLength: 3}, // ARCH:amd64
@@ -207,6 +207,20 @@
{name: "ConcatPermuteUint64x2", argLength: 3}, // ARCH:amd64
{name: "ConcatPermuteUint64x4", argLength: 3}, // ARCH:amd64
{name: "ConcatPermuteUint64x8", argLength: 3}, // ARCH:amd64
+ {name: "ConcatSubPairsFloat32x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsFloat64x2", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedFloat32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedFloat64x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedInt16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedInt32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedUint16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsGroupedUint32x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsInt32x4", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsSaturatedGroupedInt16x16", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsSaturatedInt16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsUint16x8", argLength: 2}, // ARCH:amd64
+ {name: "ConcatSubPairsUint32x4", argLength: 2}, // ARCH:amd64
{name: "ConvertToFloat32Float64x2", argLength: 1}, // ARCH:amd64
{name: "ConvertToFloat32Float64x4", argLength: 1}, // ARCH:amd64
{name: "ConvertToFloat32Float64x8", argLength: 1}, // ARCH:amd64
@@ -999,20 +1013,6 @@
{name: "SubInt64x2", argLength: 2}, // ARCH:amd64
{name: "SubInt64x4", argLength: 2}, // ARCH:amd64
{name: "SubInt64x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsFloat32x4", argLength: 2}, // ARCH:amd64
- {name: "SubPairsFloat64x2", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedFloat32x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedFloat64x4", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedInt16x16", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedInt32x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedUint16x16", argLength: 2}, // ARCH:amd64
- {name: "SubPairsGroupedUint32x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsInt16x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsInt32x4", argLength: 2}, // ARCH:amd64
- {name: "SubPairsSaturatedGroupedInt16x16", argLength: 2}, // ARCH:amd64
- {name: "SubPairsSaturatedInt16x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsUint16x8", argLength: 2}, // ARCH:amd64
- {name: "SubPairsUint32x4", argLength: 2}, // ARCH:amd64
{name: "SubSaturatedInt8x16", argLength: 2}, // ARCH:amd64
{name: "SubSaturatedInt8x32", argLength: 2}, // ARCH:amd64
{name: "SubSaturatedInt8x64", argLength: 2}, // ARCH:amd64
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index acb2d81..52a6b39 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -6228,20 +6228,6 @@
OpAddInt64x2
OpAddInt64x4
OpAddInt64x8
- OpAddPairsFloat32x4
- OpAddPairsFloat64x2
- OpAddPairsGroupedFloat32x8
- OpAddPairsGroupedFloat64x4
- OpAddPairsGroupedInt16x16
- OpAddPairsGroupedInt32x8
- OpAddPairsGroupedUint16x16
- OpAddPairsGroupedUint32x8
- OpAddPairsInt16x8
- OpAddPairsInt32x4
- OpAddPairsSaturatedGroupedInt16x16
- OpAddPairsSaturatedInt16x8
- OpAddPairsUint16x8
- OpAddPairsUint32x4
OpAddSaturatedInt8x16
OpAddSaturatedInt8x32
OpAddSaturatedInt8x64
@@ -6358,6 +6344,20 @@
OpCompressUint64x2
OpCompressUint64x4
OpCompressUint64x8
+ OpConcatAddPairsFloat32x4
+ OpConcatAddPairsFloat64x2
+ OpConcatAddPairsGroupedFloat32x8
+ OpConcatAddPairsGroupedFloat64x4
+ OpConcatAddPairsGroupedInt16x16
+ OpConcatAddPairsGroupedInt32x8
+ OpConcatAddPairsGroupedUint16x16
+ OpConcatAddPairsGroupedUint32x8
+ OpConcatAddPairsInt16x8
+ OpConcatAddPairsInt32x4
+ OpConcatAddPairsSaturatedGroupedInt16x16
+ OpConcatAddPairsSaturatedInt16x8
+ OpConcatAddPairsUint16x8
+ OpConcatAddPairsUint32x4
OpConcatPermuteFloat32x4
OpConcatPermuteFloat32x8
OpConcatPermuteFloat32x16
@@ -6388,6 +6388,20 @@
OpConcatPermuteUint64x2
OpConcatPermuteUint64x4
OpConcatPermuteUint64x8
+ OpConcatSubPairsFloat32x4
+ OpConcatSubPairsFloat64x2
+ OpConcatSubPairsGroupedFloat32x8
+ OpConcatSubPairsGroupedFloat64x4
+ OpConcatSubPairsGroupedInt16x16
+ OpConcatSubPairsGroupedInt32x8
+ OpConcatSubPairsGroupedUint16x16
+ OpConcatSubPairsGroupedUint32x8
+ OpConcatSubPairsInt16x8
+ OpConcatSubPairsInt32x4
+ OpConcatSubPairsSaturatedGroupedInt16x16
+ OpConcatSubPairsSaturatedInt16x8
+ OpConcatSubPairsUint16x8
+ OpConcatSubPairsUint32x4
OpConvertToFloat32Float64x2
OpConvertToFloat32Float64x4
OpConvertToFloat32Float64x8
@@ -7180,20 +7194,6 @@
OpSubInt64x2
OpSubInt64x4
OpSubInt64x8
- OpSubPairsFloat32x4
- OpSubPairsFloat64x2
- OpSubPairsGroupedFloat32x8
- OpSubPairsGroupedFloat64x4
- OpSubPairsGroupedInt16x16
- OpSubPairsGroupedInt32x8
- OpSubPairsGroupedUint16x16
- OpSubPairsGroupedUint32x8
- OpSubPairsInt16x8
- OpSubPairsInt32x4
- OpSubPairsSaturatedGroupedInt16x16
- OpSubPairsSaturatedInt16x8
- OpSubPairsUint16x8
- OpSubPairsUint32x4
OpSubSaturatedInt8x16
OpSubSaturatedInt8x32
OpSubSaturatedInt8x64
@@ -89498,76 +89498,6 @@
generic: true,
},
{
- name: "AddPairsFloat32x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsFloat64x2",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedFloat32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedFloat64x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedInt16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedInt32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedUint16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsGroupedUint32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsInt16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsInt32x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsSaturatedGroupedInt16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsSaturatedInt16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsUint16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "AddPairsUint32x4",
- argLen: 2,
- generic: true,
- },
- {
name: "AddSaturatedInt8x16",
argLen: 2,
commutative: true,
@@ -90202,6 +90132,76 @@
generic: true,
},
{
+ name: "ConcatAddPairsFloat32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsFloat64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedFloat32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedFloat64x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedInt16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedInt32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedUint16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsGroupedUint32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsInt32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsSaturatedGroupedInt16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsSaturatedInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsUint16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatAddPairsUint32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
name: "ConcatPermuteFloat32x4",
argLen: 3,
generic: true,
@@ -90352,6 +90352,76 @@
generic: true,
},
{
+ name: "ConcatSubPairsFloat32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsFloat64x2",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedFloat32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedFloat64x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedInt16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedInt32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedUint16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsGroupedUint32x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsInt32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsSaturatedGroupedInt16x16",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsSaturatedInt16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsUint16x8",
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatSubPairsUint32x4",
+ argLen: 2,
+ generic: true,
+ },
+ {
name: "ConvertToFloat32Float64x2",
argLen: 1,
generic: true,
@@ -94474,76 +94544,6 @@
generic: true,
},
{
- name: "SubPairsFloat32x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsFloat64x2",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedFloat32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedFloat64x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedInt16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedInt32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedUint16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsGroupedUint32x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsInt16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsInt32x4",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsSaturatedGroupedInt16x16",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsSaturatedInt16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsUint16x8",
- argLen: 2,
- generic: true,
- },
- {
- name: "SubPairsUint32x4",
- argLen: 2,
- generic: true,
- },
- {
name: "SubSaturatedInt8x16",
argLen: 2,
generic: true,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index c2c09cc..179e492 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2192,48 +2192,6 @@
case OpAddInt8x64:
v.Op = OpAMD64VPADDB512
return true
- case OpAddPairsFloat32x4:
- v.Op = OpAMD64VHADDPS128
- return true
- case OpAddPairsFloat64x2:
- v.Op = OpAMD64VHADDPD128
- return true
- case OpAddPairsGroupedFloat32x8:
- v.Op = OpAMD64VHADDPS256
- return true
- case OpAddPairsGroupedFloat64x4:
- v.Op = OpAMD64VHADDPD256
- return true
- case OpAddPairsGroupedInt16x16:
- v.Op = OpAMD64VPHADDW256
- return true
- case OpAddPairsGroupedInt32x8:
- v.Op = OpAMD64VPHADDD256
- return true
- case OpAddPairsGroupedUint16x16:
- v.Op = OpAMD64VPHADDW256
- return true
- case OpAddPairsGroupedUint32x8:
- v.Op = OpAMD64VPHADDD256
- return true
- case OpAddPairsInt16x8:
- v.Op = OpAMD64VPHADDW128
- return true
- case OpAddPairsInt32x4:
- v.Op = OpAMD64VPHADDD128
- return true
- case OpAddPairsSaturatedGroupedInt16x16:
- v.Op = OpAMD64VPHADDSW256
- return true
- case OpAddPairsSaturatedInt16x8:
- v.Op = OpAMD64VPHADDSW128
- return true
- case OpAddPairsUint16x8:
- v.Op = OpAMD64VPHADDW128
- return true
- case OpAddPairsUint32x4:
- v.Op = OpAMD64VPHADDD128
- return true
case OpAddPtr:
v.Op = OpAMD64ADDQ
return true
@@ -2674,6 +2632,48 @@
return rewriteValueAMD64_OpCompressUint8x32(v)
case OpCompressUint8x64:
return rewriteValueAMD64_OpCompressUint8x64(v)
+ case OpConcatAddPairsFloat32x4:
+ v.Op = OpAMD64VHADDPS128
+ return true
+ case OpConcatAddPairsFloat64x2:
+ v.Op = OpAMD64VHADDPD128
+ return true
+ case OpConcatAddPairsGroupedFloat32x8:
+ v.Op = OpAMD64VHADDPS256
+ return true
+ case OpConcatAddPairsGroupedFloat64x4:
+ v.Op = OpAMD64VHADDPD256
+ return true
+ case OpConcatAddPairsGroupedInt16x16:
+ v.Op = OpAMD64VPHADDW256
+ return true
+ case OpConcatAddPairsGroupedInt32x8:
+ v.Op = OpAMD64VPHADDD256
+ return true
+ case OpConcatAddPairsGroupedUint16x16:
+ v.Op = OpAMD64VPHADDW256
+ return true
+ case OpConcatAddPairsGroupedUint32x8:
+ v.Op = OpAMD64VPHADDD256
+ return true
+ case OpConcatAddPairsInt16x8:
+ v.Op = OpAMD64VPHADDW128
+ return true
+ case OpConcatAddPairsInt32x4:
+ v.Op = OpAMD64VPHADDD128
+ return true
+ case OpConcatAddPairsSaturatedGroupedInt16x16:
+ v.Op = OpAMD64VPHADDSW256
+ return true
+ case OpConcatAddPairsSaturatedInt16x8:
+ v.Op = OpAMD64VPHADDSW128
+ return true
+ case OpConcatAddPairsUint16x8:
+ v.Op = OpAMD64VPHADDW128
+ return true
+ case OpConcatAddPairsUint32x4:
+ v.Op = OpAMD64VPHADDD128
+ return true
case OpConcatPermuteFloat32x16:
v.Op = OpAMD64VPERMI2PS512
return true
@@ -2773,6 +2773,48 @@
case OpConcatShiftBytesRightUint8x16:
v.Op = OpAMD64VPALIGNR128
return true
+ case OpConcatSubPairsFloat32x4:
+ v.Op = OpAMD64VHSUBPS128
+ return true
+ case OpConcatSubPairsFloat64x2:
+ v.Op = OpAMD64VHSUBPD128
+ return true
+ case OpConcatSubPairsGroupedFloat32x8:
+ v.Op = OpAMD64VHSUBPS256
+ return true
+ case OpConcatSubPairsGroupedFloat64x4:
+ v.Op = OpAMD64VHSUBPD256
+ return true
+ case OpConcatSubPairsGroupedInt16x16:
+ v.Op = OpAMD64VPHSUBW256
+ return true
+ case OpConcatSubPairsGroupedInt32x8:
+ v.Op = OpAMD64VPHSUBD256
+ return true
+ case OpConcatSubPairsGroupedUint16x16:
+ v.Op = OpAMD64VPHSUBW256
+ return true
+ case OpConcatSubPairsGroupedUint32x8:
+ v.Op = OpAMD64VPHSUBD256
+ return true
+ case OpConcatSubPairsInt16x8:
+ v.Op = OpAMD64VPHSUBW128
+ return true
+ case OpConcatSubPairsInt32x4:
+ v.Op = OpAMD64VPHSUBD128
+ return true
+ case OpConcatSubPairsSaturatedGroupedInt16x16:
+ v.Op = OpAMD64VPHSUBSW256
+ return true
+ case OpConcatSubPairsSaturatedInt16x8:
+ v.Op = OpAMD64VPHSUBSW128
+ return true
+ case OpConcatSubPairsUint16x8:
+ v.Op = OpAMD64VPHSUBW128
+ return true
+ case OpConcatSubPairsUint32x4:
+ v.Op = OpAMD64VPHSUBD128
+ return true
case OpCondSelect:
return rewriteValueAMD64_OpCondSelect(v)
case OpConst16:
@@ -5855,48 +5897,6 @@
case OpSubInt8x64:
v.Op = OpAMD64VPSUBB512
return true
- case OpSubPairsFloat32x4:
- v.Op = OpAMD64VHSUBPS128
- return true
- case OpSubPairsFloat64x2:
- v.Op = OpAMD64VHSUBPD128
- return true
- case OpSubPairsGroupedFloat32x8:
- v.Op = OpAMD64VHSUBPS256
- return true
- case OpSubPairsGroupedFloat64x4:
- v.Op = OpAMD64VHSUBPD256
- return true
- case OpSubPairsGroupedInt16x16:
- v.Op = OpAMD64VPHSUBW256
- return true
- case OpSubPairsGroupedInt32x8:
- v.Op = OpAMD64VPHSUBD256
- return true
- case OpSubPairsGroupedUint16x16:
- v.Op = OpAMD64VPHSUBW256
- return true
- case OpSubPairsGroupedUint32x8:
- v.Op = OpAMD64VPHSUBD256
- return true
- case OpSubPairsInt16x8:
- v.Op = OpAMD64VPHSUBW128
- return true
- case OpSubPairsInt32x4:
- v.Op = OpAMD64VPHSUBD128
- return true
- case OpSubPairsSaturatedGroupedInt16x16:
- v.Op = OpAMD64VPHSUBSW256
- return true
- case OpSubPairsSaturatedInt16x8:
- v.Op = OpAMD64VPHSUBSW128
- return true
- case OpSubPairsUint16x8:
- v.Op = OpAMD64VPHSUBW128
- return true
- case OpSubPairsUint32x4:
- v.Op = OpAMD64VPHSUBD128
- return true
case OpSubPtr:
v.Op = OpAMD64SUBQ
return true
diff --git a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
index e790254..5c94e17 100644
--- a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
@@ -66,20 +66,6 @@
addF(simdPackage, "Uint64x2.Add", opLen2(ssa.OpAddUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float64x4.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.AddPairsSaturatedGrouped", opLen2(ssa.OpAddPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64)
@@ -196,6 +182,20 @@
addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float32x4.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsFloat32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x2.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsFloat64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x8.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsInt16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x4.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x8.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsUint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x4.ConcatAddPairs", opLen2(ssa.OpConcatAddPairsUint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float32x8.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x4.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x16.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x8.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ConcatAddPairsGrouped", opLen2(ssa.OpConcatAddPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x8.ConcatAddPairsSaturated", opLen2(ssa.OpConcatAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x16.ConcatAddPairsSaturatedGrouped", opLen2(ssa.OpConcatAddPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x32, types.TypeVec256), sys.AMD64)
@@ -229,6 +229,20 @@
addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Float32x4.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsFloat32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x2.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsFloat64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x8.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsInt16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x4.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x8.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsUint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x4.ConcatSubPairs", opLen2(ssa.OpConcatSubPairsUint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float32x8.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x4.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x16.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x8.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ConcatSubPairsGrouped", opLen2(ssa.OpConcatSubPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x8.ConcatSubPairsSaturated", opLen2(ssa.OpConcatSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x16.ConcatSubPairsSaturatedGrouped", opLen2(ssa.OpConcatSubPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.ConvertToFloat32", opLen1(ssa.OpConvertToFloat32Float64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.ConvertToFloat32", opLen1(ssa.OpConvertToFloat32Float64x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x8.ConvertToFloat32", opLen1(ssa.OpConvertToFloat32Float64x8, types.TypeVec256), sys.AMD64)
@@ -1154,20 +1168,6 @@
addF(simdPackage, "Uint64x2.Sub", opLen2(ssa.OpSubUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Sub", opLen2(ssa.OpSubUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Sub", opLen2(ssa.OpSubUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float64x4.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.SubPairsSaturatedGrouped", opLen2(ssa.OpSubPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml
index ac5bd82..32c26d0 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml
@@ -15,45 +15,45 @@
commutative: false
documentation: !string |-
// NAME subtracts corresponding elements of two vectors with saturation.
-- go: AddPairs
+- go: ConcatAddPairs
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-- go: AddPairs
+- go: ConcatAddPairs
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
-- go: SubPairs
+- go: ConcatSubPairs
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-- go: SubPairs
+- go: ConcatSubPairs
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
-- go: AddPairsSaturated
+- go: ConcatAddPairsSaturated
commutative: false
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-- go: SubPairsSaturated
+- go: ConcatSubPairsSaturated
commutative: false
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-- go: AddPairsGrouped
+- go: ConcatAddPairsGrouped
commutative: false
out:
- elemBits: 16|32
@@ -61,7 +61,7 @@
// NAME horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-- go: AddPairsGrouped
+- go: ConcatAddPairsGrouped
commutative: false
out:
- elemBits: 64
@@ -69,7 +69,7 @@
// NAME horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
-- go: SubPairsGrouped
+- go: ConcatSubPairsGrouped
commutative: false
out:
- elemBits: 16|32
@@ -77,7 +77,7 @@
// NAME horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-- go: SubPairsGrouped
+- go: ConcatSubPairsGrouped
commutative: false
out:
- elemBits: 64
@@ -85,13 +85,13 @@
// NAME horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
-- go: AddPairsSaturatedGrouped
+- go: ConcatAddPairsSaturatedGrouped
commutative: false
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements with saturation.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-- go: SubPairsSaturatedGrouped
+- go: ConcatSubPairsSaturatedGrouped
commutative: false
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements with saturation.
diff --git a/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
index 17cee59..e834562 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
@@ -51,7 +51,7 @@
- *uint
out:
- *uint
-- go: AddPairs
+- go: ConcatAddPairs
asm: "VPHADD[DW]"
in: &2any128
- &any128
@@ -60,19 +60,19 @@
- *any128
out: &1any128
- *any128
-- go: SubPairs
+- go: ConcatSubPairs
asm: "VPHSUB[DW]"
in: *2any128
out: *1any128
-- go: AddPairs
+- go: ConcatAddPairs
asm: "VHADDP[SD]" # floats
in: *2any128
out: *1any128
-- go: SubPairs
+- go: ConcatSubPairs
asm: "VHSUBP[SD]" # floats
in: *2any128
out: *1any128
-- go: AddPairsSaturated
+- go: ConcatAddPairsSaturated
asm: "VPHADDS[DW]"
in: &2int128
- &int128
@@ -82,11 +82,11 @@
- *int128
out: &1int128
- *int128
-- go: SubPairsSaturated
+- go: ConcatSubPairsSaturated
asm: "VPHSUBS[DW]"
in: *2int128
out: *1int128
-- go: AddPairsGrouped
+- go: ConcatAddPairsGrouped
asm: "VPHADD[DW]"
in: &2any256
- &any256
@@ -95,19 +95,19 @@
- *any256
out: &1any256
- *any256
-- go: SubPairsGrouped
+- go: ConcatSubPairsGrouped
asm: "VPHSUB[DW]"
in: *2any256
out: *1any256
-- go: AddPairsGrouped
+- go: ConcatAddPairsGrouped
asm: "VHADDP[SD]" # floats
in: *2any256
out: *1any256
-- go: SubPairsGrouped
+- go: ConcatSubPairsGrouped
asm: "VHSUBP[SD]" # floats
in: *2any256
out: *1any256
-- go: AddPairsSaturatedGrouped
+- go: ConcatAddPairsSaturatedGrouped
asm: "VPHADDS[DW]"
in: &2int256
- &int256
@@ -117,7 +117,7 @@
- *int256
out: &1int256
- *int256
-- go: SubPairsSaturatedGrouped
+- go: ConcatSubPairsSaturatedGrouped
asm: "VPHSUBS[DW]"
in: *2int256
out: *1int256
diff --git a/src/simd/archsimd/internal/simd_test/simd_test.go b/src/simd/archsimd/internal/simd_test/simd_test.go
index 71366f6..224d0bf 100644
--- a/src/simd/archsimd/internal/simd_test/simd_test.go
+++ b/src/simd/archsimd/internal/simd_test/simd_test.go
@@ -1329,33 +1329,33 @@
}

func TestAddSubPairs(t *testing.T) {
- testInt16x8Binary(t, archsimd.Int16x8.AddPairs, addPairsSlice[int16])
- testInt16x8Binary(t, archsimd.Int16x8.SubPairs, subPairsSlice[int16])
- testUint16x8Binary(t, archsimd.Uint16x8.AddPairs, addPairsSlice[uint16])
- testUint16x8Binary(t, archsimd.Uint16x8.SubPairs, subPairsSlice[uint16])
- testInt32x4Binary(t, archsimd.Int32x4.AddPairs, addPairsSlice[int32])
- testInt32x4Binary(t, archsimd.Int32x4.SubPairs, subPairsSlice[int32])
- testUint32x4Binary(t, archsimd.Uint32x4.AddPairs, addPairsSlice[uint32])
- testUint32x4Binary(t, archsimd.Uint32x4.SubPairs, subPairsSlice[uint32])
- testFloat32x4Binary(t, archsimd.Float32x4.AddPairs, addPairsSlice[float32])
- testFloat32x4Binary(t, archsimd.Float32x4.SubPairs, subPairsSlice[float32])
- testFloat64x2Binary(t, archsimd.Float64x2.AddPairs, addPairsSlice[float64])
- testFloat64x2Binary(t, archsimd.Float64x2.SubPairs, subPairsSlice[float64])
+ testInt16x8Binary(t, archsimd.Int16x8.ConcatAddPairs, addPairsSlice[int16])
+ testInt16x8Binary(t, archsimd.Int16x8.ConcatSubPairs, subPairsSlice[int16])
+ testUint16x8Binary(t, archsimd.Uint16x8.ConcatAddPairs, addPairsSlice[uint16])
+ testUint16x8Binary(t, archsimd.Uint16x8.ConcatSubPairs, subPairsSlice[uint16])
+ testInt32x4Binary(t, archsimd.Int32x4.ConcatAddPairs, addPairsSlice[int32])
+ testInt32x4Binary(t, archsimd.Int32x4.ConcatSubPairs, subPairsSlice[int32])
+ testUint32x4Binary(t, archsimd.Uint32x4.ConcatAddPairs, addPairsSlice[uint32])
+ testUint32x4Binary(t, archsimd.Uint32x4.ConcatSubPairs, subPairsSlice[uint32])
+ testFloat32x4Binary(t, archsimd.Float32x4.ConcatAddPairs, addPairsSlice[float32])
+ testFloat32x4Binary(t, archsimd.Float32x4.ConcatSubPairs, subPairsSlice[float32])
+ testFloat64x2Binary(t, archsimd.Float64x2.ConcatAddPairs, addPairsSlice[float64])
+ testFloat64x2Binary(t, archsimd.Float64x2.ConcatSubPairs, subPairsSlice[float64])

// Grouped versions
if archsimd.X86.AVX2() {
- testInt16x16Binary(t, archsimd.Int16x16.AddPairsGrouped, addPairsGroupedSlice[int16])
- testInt16x16Binary(t, archsimd.Int16x16.SubPairsGrouped, subPairsGroupedSlice[int16])
- testUint16x16Binary(t, archsimd.Uint16x16.AddPairsGrouped, addPairsGroupedSlice[uint16])
- testUint16x16Binary(t, archsimd.Uint16x16.SubPairsGrouped, subPairsGroupedSlice[uint16])
- testInt32x8Binary(t, archsimd.Int32x8.AddPairsGrouped, addPairsGroupedSlice[int32])
- testInt32x8Binary(t, archsimd.Int32x8.SubPairsGrouped, subPairsGroupedSlice[int32])
- testUint32x8Binary(t, archsimd.Uint32x8.AddPairsGrouped, addPairsGroupedSlice[uint32])
- testUint32x8Binary(t, archsimd.Uint32x8.SubPairsGrouped, subPairsGroupedSlice[uint32])
- testFloat32x8Binary(t, archsimd.Float32x8.AddPairsGrouped, addPairsGroupedSlice[float32])
- testFloat32x8Binary(t, archsimd.Float32x8.SubPairsGrouped, subPairsGroupedSlice[float32])
- testFloat64x4Binary(t, archsimd.Float64x4.AddPairsGrouped, addPairsGroupedSlice[float64])
- testFloat64x4Binary(t, archsimd.Float64x4.SubPairsGrouped, subPairsGroupedSlice[float64])
+ testInt16x16Binary(t, archsimd.Int16x16.ConcatAddPairsGrouped, addPairsGroupedSlice[int16])
+ testInt16x16Binary(t, archsimd.Int16x16.ConcatSubPairsGrouped, subPairsGroupedSlice[int16])
+ testUint16x16Binary(t, archsimd.Uint16x16.ConcatAddPairsGrouped, addPairsGroupedSlice[uint16])
+ testUint16x16Binary(t, archsimd.Uint16x16.ConcatSubPairsGrouped, subPairsGroupedSlice[uint16])
+ testInt32x8Binary(t, archsimd.Int32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[int32])
+ testInt32x8Binary(t, archsimd.Int32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[int32])
+ testUint32x8Binary(t, archsimd.Uint32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[uint32])
+ testUint32x8Binary(t, archsimd.Uint32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[uint32])
+ testFloat32x8Binary(t, archsimd.Float32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[float32])
+ testFloat32x8Binary(t, archsimd.Float32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[float32])
+ testFloat64x4Binary(t, archsimd.Float64x4.ConcatAddPairsGrouped, addPairsGroupedSlice[float64])
+ testFloat64x4Binary(t, archsimd.Float64x4.ConcatSubPairsGrouped, subPairsGroupedSlice[float64])
}
}

diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index b105206..5cfe5fb 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -346,105 +346,6 @@
// Asm: VPADDQ, CPU Feature: AVX512
func (x Uint64x8) Add(y Uint64x8) Uint64x8

-/* AddPairs */
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VHADDPS, CPU Feature: AVX
-func (x Float32x4) AddPairs(y Float32x4) Float32x4
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
-//
-// Asm: VHADDPD, CPU Feature: AVX
-func (x Float64x2) AddPairs(y Float64x2) Float64x2
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX
-func (x Int16x8) AddPairs(y Int16x8) Int16x8
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX
-func (x Int32x4) AddPairs(y Int32x4) Int32x4
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX
-func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX
-func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4
-
-/* AddPairsGrouped */
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VHADDPS, CPU Feature: AVX
-func (x Float32x8) AddPairsGrouped(y Float32x8) Float32x8
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
-//
-// Asm: VHADDPD, CPU Feature: AVX
-func (x Float64x4) AddPairsGrouped(y Float64x4) Float64x4
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX2
-func (x Int16x16) AddPairsGrouped(y Int16x16) Int16x16
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX2
-func (x Int32x8) AddPairsGrouped(y Int32x8) Int32x8
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX2
-func (x Uint16x16) AddPairsGrouped(y Uint16x16) Uint16x16
-
-// AddPairsGrouped horizontally adds adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX2
-func (x Uint32x8) AddPairsGrouped(y Uint32x8) Uint32x8
-
-/* AddPairsSaturated */
-
-// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDSW, CPU Feature: AVX
-func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8
-
-/* AddPairsSaturatedGrouped */
-
-// AddPairsSaturatedGrouped horizontally adds adjacent pairs of elements with saturation.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
-//
-// Asm: VPHADDSW, CPU Feature: AVX2
-func (x Int16x16) AddPairsSaturatedGrouped(y Int16x16) Int16x16
-
/* AddSaturated */

// AddSaturated adds corresponding elements of two vectors with saturation.
@@ -1097,6 +998,105 @@
// Asm: VPCOMPRESSQ, CPU Feature: AVX512
func (x Uint64x8) Compress(mask Mask64x8) Uint64x8

+/* ConcatAddPairs */
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VHADDPS, CPU Feature: AVX
+func (x Float32x4) ConcatAddPairs(y Float32x4) Float32x4
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
+//
+// Asm: VHADDPD, CPU Feature: AVX
+func (x Float64x2) ConcatAddPairs(y Float64x2) Float64x2
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX
+func (x Int16x8) ConcatAddPairs(y Int16x8) Int16x8
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX
+func (x Int32x4) ConcatAddPairs(y Int32x4) Int32x4
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX
+func (x Uint16x8) ConcatAddPairs(y Uint16x8) Uint16x8
+
+// ConcatAddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX
+func (x Uint32x4) ConcatAddPairs(y Uint32x4) Uint32x4
+
+/* ConcatAddPairsGrouped */
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VHADDPS, CPU Feature: AVX
+func (x Float32x8) ConcatAddPairsGrouped(y Float32x8) Float32x8
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
+//
+// Asm: VHADDPD, CPU Feature: AVX
+func (x Float64x4) ConcatAddPairsGrouped(y Float64x4) Float64x4
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX2
+func (x Int16x16) ConcatAddPairsGrouped(y Int16x16) Int16x16
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX2
+func (x Int32x8) ConcatAddPairsGrouped(y Int32x8) Int32x8
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX2
+func (x Uint16x16) ConcatAddPairsGrouped(y Uint16x16) Uint16x16
+
+// ConcatAddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX2
+func (x Uint32x8) ConcatAddPairsGrouped(y Uint32x8) Uint32x8
+
+/* ConcatAddPairsSaturated */
+
+// ConcatAddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDSW, CPU Feature: AVX
+func (x Int16x8) ConcatAddPairsSaturated(y Int16x8) Int16x8
+
+/* ConcatAddPairsSaturatedGrouped */
+
+// ConcatAddPairsSaturatedGrouped horizontally adds adjacent pairs of elements with saturation.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDSW, CPU Feature: AVX2
+func (x Int16x16) ConcatAddPairsSaturatedGrouped(y Int16x16) Int16x16
+
/* ConcatPermute */

// ConcatPermute performs a full permutation of vector x, y using indices:
@@ -1429,6 +1429,105 @@
// Asm: VPALIGNR, CPU Feature: AVX512
func (x Uint8x64) ConcatShiftBytesRightGrouped(y Uint8x64, shift uint8) Uint8x64

+/* ConcatSubPairs */
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x4) ConcatSubPairs(y Float32x4) Float32x4
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x2) ConcatSubPairs(y Float64x2) Float64x2
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Int16x8) ConcatSubPairs(y Int16x8) Int16x8
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX
+func (x Int32x4) ConcatSubPairs(y Int32x4) Int32x4
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Uint16x8) ConcatSubPairs(y Uint16x8) Uint16x8
+
+// ConcatSubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX
+func (x Uint32x4) ConcatSubPairs(y Uint32x4) Uint32x4
+
+/* ConcatSubPairsGrouped */
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x8) ConcatSubPairsGrouped(y Float32x8) Float32x8
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x4) ConcatSubPairsGrouped(y Float64x4) Float64x4
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX2
+func (x Int16x16) ConcatSubPairsGrouped(y Int16x16) Int16x16
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX2
+func (x Int32x8) ConcatSubPairsGrouped(y Int32x8) Int32x8
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX2
+func (x Uint16x16) ConcatSubPairsGrouped(y Uint16x16) Uint16x16
+
+// ConcatSubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX2
+func (x Uint32x8) ConcatSubPairsGrouped(y Uint32x8) Uint32x8
+
+/* ConcatSubPairsSaturated */
+
+// ConcatSubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBSW, CPU Feature: AVX
+func (x Int16x8) ConcatSubPairsSaturated(y Int16x8) Int16x8
+
+/* ConcatSubPairsSaturatedGrouped */
+
+// ConcatSubPairsSaturatedGrouped horizontally subtracts adjacent pairs of elements with saturation.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBSW, CPU Feature: AVX2
+func (x Int16x16) ConcatSubPairsSaturatedGrouped(y Int16x16) Int16x16
+
/* ConvertToFloat32 */

// ConvertToFloat32 converts element values to float32.
@@ -7170,105 +7269,6 @@
// Asm: VPSUBQ, CPU Feature: AVX512
func (x Uint64x8) Sub(y Uint64x8) Uint64x8

-/* SubPairs */
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VHSUBPS, CPU Feature: AVX
-func (x Float32x4) SubPairs(y Float32x4) Float32x4
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
-//
-// Asm: VHSUBPD, CPU Feature: AVX
-func (x Float64x2) SubPairs(y Float64x2) Float64x2
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX
-func (x Int16x8) SubPairs(y Int16x8) Int16x8
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX
-func (x Int32x4) SubPairs(y Int32x4) Int32x4
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX
-func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX
-func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4
-
-/* SubPairsGrouped */
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VHSUBPS, CPU Feature: AVX
-func (x Float32x8) SubPairsGrouped(y Float32x8) Float32x8
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
-//
-// Asm: VHSUBPD, CPU Feature: AVX
-func (x Float64x4) SubPairsGrouped(y Float64x4) Float64x4
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX2
-func (x Int16x16) SubPairsGrouped(y Int16x16) Int16x16
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX2
-func (x Int32x8) SubPairsGrouped(y Int32x8) Int32x8
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX2
-func (x Uint16x16) SubPairsGrouped(y Uint16x16) Uint16x16
-
-// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX2
-func (x Uint32x8) SubPairsGrouped(y Uint32x8) Uint32x8
-
-/* SubPairsSaturated */
-
-// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBSW, CPU Feature: AVX
-func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8
-
-/* SubPairsSaturatedGrouped */
-
-// SubPairsSaturatedGrouped horizontally subtracts adjacent pairs of elements with saturation.
-// With each 128-bit as a group:
-// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
-//
-// Asm: VPHSUBSW, CPU Feature: AVX2
-func (x Int16x16) SubPairsSaturatedGrouped(y Int16x16) Int16x16
-
/* SubSaturated */

// SubSaturated subtracts corresponding elements of two vectors with saturation.

Change information

Files:
  • M src/cmd/compile/internal/amd64/simdssa.go
  • M src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
  • M src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
  • M src/cmd/compile/internal/ssa/opGen.go
  • M src/cmd/compile/internal/ssa/rewriteAMD64.go
  • M src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
  • M src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml
  • M src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
  • M src/simd/archsimd/internal/simd_test/simd_test.go
  • M src/simd/archsimd/ops_amd64.go
Change size: XL
Delta: 10 files changed, 602 insertions(+), 602 deletions(-)
Open in Gerrit

Related details

Attention set is empty
Submit Requirements:
  • requirement is not satisfied: Code-Review
  • requirement satisfied: No-Unresolved-Comments
  • requirement is not satisfied: Review-Enforcement
  • requirement is not satisfied: TryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newchange
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: I7a3f6fff272bc7630c116dc265d1677c0187d12a
Gerrit-Change-Number: 777000
Gerrit-PatchSet: 1
Gerrit-Owner: Junyang Shao <shaoj...@google.com>
unsatisfied_requirement
satisfied_requirement
open
diffy

Junyang Shao (Gerrit)

unread,
5:11 PM (6 hours ago) 5:11 PM
to goph...@pubsubhelper.golang.org, David Chase, golang-co...@googlegroups.com
Attention needed from David Chase

Junyang Shao voted Commit-Queue+1

Commit-Queue+1
Open in Gerrit

Related details

Attention is currently required from:
  • David Chase
Submit Requirements:
  • requirement is not satisfied: Code-Review
  • requirement satisfied: No-Unresolved-Comments
  • requirement is not satisfied: Review-Enforcement
  • requirement is not satisfied: TryBots-Pass
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: comment
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: I7a3f6fff272bc7630c116dc265d1677c0187d12a
Gerrit-Change-Number: 777000
Gerrit-PatchSet: 2
Gerrit-Owner: Junyang Shao <shaoj...@google.com>
Gerrit-Reviewer: David Chase <drc...@google.com>
Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
Gerrit-Attention: David Chase <drc...@google.com>
Gerrit-Comment-Date: Mon, 11 May 2026 21:11:14 +0000
Gerrit-HasComments: No
Gerrit-Has-Labels: Yes
unsatisfied_requirement
satisfied_requirement
open
diffy
Reply all
Reply to author
Forward
0 new messages