[go/dev.simd] [dev.simd] simd, cmd/compile: change Select(128)?FromPair names

1 view
Skip to first unread message

Junyang Shao (Gerrit)

unread,
5:09 PM (6 hours ago)
to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Junyang Shao has uploaded the change for review

Commit message

[dev.simd] simd, cmd/compile: change Select(128)?FromPair names

This CL renames Select(128)?FromPair to ConcatPermute(128)?Scalars.

For #78979.
Change-Id: Ida086183afad589e6c457c7f5fff508a71a5f2dd

Change diff

diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 402e711..9a0febe 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -1211,7 +1211,9 @@
ssa.OpAMD64VPSRLQMasked512const:
p = simdVkvImm8(s, v)

- case ssa.OpAMD64VPALIGNR128,
+ case ssa.OpAMD64VPERM2F128256,
+ ssa.OpAMD64VPERM2I128256,
+ ssa.OpAMD64VPALIGNR128,
ssa.OpAMD64VPALIGNR256,
ssa.OpAMD64VPALIGNR512,
ssa.OpAMD64VCMPPS128,
@@ -1224,8 +1226,6 @@
ssa.OpAMD64VGF2P8AFFINEINVQB128,
ssa.OpAMD64VGF2P8AFFINEINVQB256,
ssa.OpAMD64VGF2P8AFFINEINVQB512,
- ssa.OpAMD64VPERM2F128256,
- ssa.OpAMD64VPERM2I128256,
ssa.OpAMD64VINSERTF128256,
ssa.OpAMD64VINSERTF64X4512,
ssa.OpAMD64VINSERTI128256,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index cdbedbc..fc3461e 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -216,6 +216,16 @@
(ConcatPermuteUint64x2 ...) => (VPERMI2Q128 ...)
(ConcatPermuteUint64x4 ...) => (VPERMI2Q256 ...)
(ConcatPermuteUint64x8 ...) => (VPERMI2Q512 ...)
+(ConcatPermute128ScalarsFloat32x8 ...) => (VPERM2F128256 ...)
+(ConcatPermute128ScalarsFloat64x4 ...) => (VPERM2F128256 ...)
+(ConcatPermute128ScalarsInt8x32 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsInt16x16 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsInt32x8 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsInt64x4 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsUint8x32 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsUint16x16 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsUint32x8 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsUint64x4 ...) => (VPERM2I128256 ...)
(ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
(ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
(ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
@@ -918,16 +928,6 @@
(ScaleFloat64x2 ...) => (VSCALEFPD128 ...)
(ScaleFloat64x4 ...) => (VSCALEFPD256 ...)
(ScaleFloat64x8 ...) => (VSCALEFPD512 ...)
-(Select128FromPairFloat32x8 ...) => (VPERM2F128256 ...)
-(Select128FromPairFloat64x4 ...) => (VPERM2F128256 ...)
-(Select128FromPairInt8x32 ...) => (VPERM2I128256 ...)
-(Select128FromPairInt16x16 ...) => (VPERM2I128256 ...)
-(Select128FromPairInt32x8 ...) => (VPERM2I128256 ...)
-(Select128FromPairInt64x4 ...) => (VPERM2I128256 ...)
-(Select128FromPairUint8x32 ...) => (VPERM2I128256 ...)
-(Select128FromPairUint16x16 ...) => (VPERM2I128256 ...)
-(Select128FromPairUint32x8 ...) => (VPERM2I128256 ...)
-(Select128FromPairUint64x4 ...) => (VPERM2I128256 ...)
(SetElemFloat32x4 ...) => (VPINSRD128 ...)
(SetElemFloat64x2 ...) => (VPINSRQ128 ...)
(SetElemInt8x16 ...) => (VPINSRB128 ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 3644bbc..c9f7929 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -1183,6 +1183,16 @@
{name: "CeilScaledResidueFloat64x2", argLength: 1, aux: "UInt8"}, // ARCH:amd64
{name: "CeilScaledResidueFloat64x4", argLength: 1, aux: "UInt8"}, // ARCH:amd64
{name: "CeilScaledResidueFloat64x8", argLength: 1, aux: "UInt8"}, // ARCH:amd64
+ {name: "ConcatPermute128ScalarsFloat32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ConcatPermute128ScalarsFloat64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ConcatPermute128ScalarsInt8x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ConcatPermute128ScalarsInt16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ConcatPermute128ScalarsInt32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ConcatPermute128ScalarsInt64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ConcatPermute128ScalarsUint8x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ConcatPermute128ScalarsUint16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ConcatPermute128ScalarsUint32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
+ {name: "ConcatPermute128ScalarsUint64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
{name: "ConcatShiftBytesRightGroupedUint8x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
{name: "ConcatShiftBytesRightGroupedUint8x64", argLength: 2, aux: "UInt8"}, // ARCH:amd64
{name: "ConcatShiftBytesRightUint8x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
@@ -1251,16 +1261,6 @@
{name: "RoundScaledResidueFloat64x4", argLength: 1, aux: "UInt8"}, // ARCH:amd64
{name: "RoundScaledResidueFloat64x8", argLength: 1, aux: "UInt8"}, // ARCH:amd64
{name: "SHA1FourRoundsUint32x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "Select128FromPairFloat32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "Select128FromPairFloat64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "Select128FromPairInt8x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "Select128FromPairInt16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "Select128FromPairInt32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "Select128FromPairInt64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "Select128FromPairUint8x32", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "Select128FromPairUint16x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "Select128FromPairUint32x8", argLength: 2, aux: "UInt8"}, // ARCH:amd64
- {name: "Select128FromPairUint64x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
{name: "SetElemFloat32x4", argLength: 2, aux: "UInt8"}, // ARCH:amd64
{name: "SetElemFloat64x2", argLength: 2, aux: "UInt8"}, // ARCH:amd64
{name: "SetElemInt8x16", argLength: 2, aux: "UInt8"}, // ARCH:amd64
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 52a6b39..2d375d4 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -7364,6 +7364,16 @@
OpCeilScaledResidueFloat64x2
OpCeilScaledResidueFloat64x4
OpCeilScaledResidueFloat64x8
+ OpConcatPermute128ScalarsFloat32x8
+ OpConcatPermute128ScalarsFloat64x4
+ OpConcatPermute128ScalarsInt8x32
+ OpConcatPermute128ScalarsInt16x16
+ OpConcatPermute128ScalarsInt32x8
+ OpConcatPermute128ScalarsInt64x4
+ OpConcatPermute128ScalarsUint8x32
+ OpConcatPermute128ScalarsUint16x16
+ OpConcatPermute128ScalarsUint32x8
+ OpConcatPermute128ScalarsUint64x4
OpConcatShiftBytesRightGroupedUint8x32
OpConcatShiftBytesRightGroupedUint8x64
OpConcatShiftBytesRightUint8x16
@@ -7432,16 +7442,6 @@
OpRoundScaledResidueFloat64x4
OpRoundScaledResidueFloat64x8
OpSHA1FourRoundsUint32x4
- OpSelect128FromPairFloat32x8
- OpSelect128FromPairFloat64x4
- OpSelect128FromPairInt8x32
- OpSelect128FromPairInt16x16
- OpSelect128FromPairInt32x8
- OpSelect128FromPairInt64x4
- OpSelect128FromPairUint8x32
- OpSelect128FromPairUint16x16
- OpSelect128FromPairUint32x8
- OpSelect128FromPairUint64x4
OpSetElemFloat32x4
OpSetElemFloat64x2
OpSetElemInt8x16
@@ -95431,6 +95431,66 @@
generic: true,
},
{
+ name: "ConcatPermute128ScalarsFloat32x8",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatPermute128ScalarsFloat64x4",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatPermute128ScalarsInt8x32",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatPermute128ScalarsInt16x16",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatPermute128ScalarsInt32x8",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatPermute128ScalarsInt64x4",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatPermute128ScalarsUint8x32",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatPermute128ScalarsUint16x16",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatPermute128ScalarsUint32x8",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatPermute128ScalarsUint64x4",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
name: "ConcatShiftBytesRightGroupedUint8x32",
auxType: auxUInt8,
argLen: 2,
@@ -95839,66 +95899,6 @@
generic: true,
},
{
- name: "Select128FromPairFloat32x8",
- auxType: auxUInt8,
- argLen: 2,
- generic: true,
- },
- {
- name: "Select128FromPairFloat64x4",
- auxType: auxUInt8,
- argLen: 2,
- generic: true,
- },
- {
- name: "Select128FromPairInt8x32",
- auxType: auxUInt8,
- argLen: 2,
- generic: true,
- },
- {
- name: "Select128FromPairInt16x16",
- auxType: auxUInt8,
- argLen: 2,
- generic: true,
- },
- {
- name: "Select128FromPairInt32x8",
- auxType: auxUInt8,
- argLen: 2,
- generic: true,
- },
- {
- name: "Select128FromPairInt64x4",
- auxType: auxUInt8,
- argLen: 2,
- generic: true,
- },
- {
- name: "Select128FromPairUint8x32",
- auxType: auxUInt8,
- argLen: 2,
- generic: true,
- },
- {
- name: "Select128FromPairUint16x16",
- auxType: auxUInt8,
- argLen: 2,
- generic: true,
- },
- {
- name: "Select128FromPairUint32x8",
- auxType: auxUInt8,
- argLen: 2,
- generic: true,
- },
- {
- name: "Select128FromPairUint64x4",
- auxType: auxUInt8,
- argLen: 2,
- generic: true,
- },
- {
name: "SetElemFloat32x4",
auxType: auxUInt8,
argLen: 2,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 179e492..73533cb 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2674,6 +2674,36 @@
case OpConcatAddPairsUint32x4:
v.Op = OpAMD64VPHADDD128
return true
+ case OpConcatPermute128ScalarsFloat32x8:
+ v.Op = OpAMD64VPERM2F128256
+ return true
+ case OpConcatPermute128ScalarsFloat64x4:
+ v.Op = OpAMD64VPERM2F128256
+ return true
+ case OpConcatPermute128ScalarsInt16x16:
+ v.Op = OpAMD64VPERM2I128256
+ return true
+ case OpConcatPermute128ScalarsInt32x8:
+ v.Op = OpAMD64VPERM2I128256
+ return true
+ case OpConcatPermute128ScalarsInt64x4:
+ v.Op = OpAMD64VPERM2I128256
+ return true
+ case OpConcatPermute128ScalarsInt8x32:
+ v.Op = OpAMD64VPERM2I128256
+ return true
+ case OpConcatPermute128ScalarsUint16x16:
+ v.Op = OpAMD64VPERM2I128256
+ return true
+ case OpConcatPermute128ScalarsUint32x8:
+ v.Op = OpAMD64VPERM2I128256
+ return true
+ case OpConcatPermute128ScalarsUint64x4:
+ v.Op = OpAMD64VPERM2I128256
+ return true
+ case OpConcatPermute128ScalarsUint8x32:
+ v.Op = OpAMD64VPERM2I128256
+ return true
case OpConcatPermuteFloat32x16:
v.Op = OpAMD64VPERMI2PS512
return true
@@ -5190,36 +5220,6 @@
return rewriteValueAMD64_OpSelect0(v)
case OpSelect1:
return rewriteValueAMD64_OpSelect1(v)
- case OpSelect128FromPairFloat32x8:
- v.Op = OpAMD64VPERM2F128256
- return true
- case OpSelect128FromPairFloat64x4:
- v.Op = OpAMD64VPERM2F128256
- return true
- case OpSelect128FromPairInt16x16:
- v.Op = OpAMD64VPERM2I128256
- return true
- case OpSelect128FromPairInt32x8:
- v.Op = OpAMD64VPERM2I128256
- return true
- case OpSelect128FromPairInt64x4:
- v.Op = OpAMD64VPERM2I128256
- return true
- case OpSelect128FromPairInt8x32:
- v.Op = OpAMD64VPERM2I128256
- return true
- case OpSelect128FromPairUint16x16:
- v.Op = OpAMD64VPERM2I128256
- return true
- case OpSelect128FromPairUint32x8:
- v.Op = OpAMD64VPERM2I128256
- return true
- case OpSelect128FromPairUint64x4:
- v.Op = OpAMD64VPERM2I128256
- return true
- case OpSelect128FromPairUint8x32:
- v.Op = OpAMD64VPERM2I128256
- return true
case OpSelectN:
return rewriteValueAMD64_OpSelectN(v)
case OpSetElemFloat32x4:
diff --git a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
index 5c94e17..7917b19 100644
--- a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
@@ -226,6 +226,16 @@
addF(simdPackage, "Float64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float32x8.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsFloat32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Float64x4.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsFloat64x4, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int8x32.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsInt8x32, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int16x16.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsInt16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int32x8.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsInt32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int64x4.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsInt64x4, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint8x32.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsUint8x32, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsUint16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsUint32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x4.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsUint64x4, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
@@ -928,16 +938,6 @@
addF(simdPackage, "Float64x2.Scale", opLen2(ssa.OpScaleFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.Scale", opLen2(ssa.OpScaleFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Float64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat64x4, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int8x32.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt8x32, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int16x16.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt64x4, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint8x32.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint8x32, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint16x16.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint64x4, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Float32x4.SetElem", opLen2Imm8(ssa.OpSetElemFloat32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Float64x2.SetElem", opLen2Imm8(ssa.OpSetElemFloat64x2, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int8x16.SetElem", opLen2Imm8(ssa.OpSetElemInt8x16, types.TypeVec128, 0), sys.AMD64)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
index 70c8178..43a0743 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
@@ -255,7 +255,7 @@
// This differs from the same method applied to a 32x8 or 32x16 vector, where
// the 8-bit constant performs the same selection on all the subvectors.

-- go: Select128FromPair
+- go: ConcatPermute128Scalars
commutative: false
documentation: !string |-
// NAME treats the 256-bit vectors x and y as a single vector of four
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
index 927f88c..ae64e4e 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
@@ -880,7 +880,7 @@
out:
- *v

-- go: Select128FromPair
+- go: ConcatPermute128Scalars
asm: VPERM2F128
operandOrder: II
addDoc: !string |-
@@ -903,7 +903,7 @@
out:
- *v

-- go: Select128FromPair
+- go: ConcatPermute128Scalars
asm: VPERM2F128
operandOrder: II
addDoc: !string |-
@@ -927,7 +927,7 @@
out:
- *v

-- go: Select128FromPair
+- go: ConcatPermute128Scalars
asm: VPERM2I128
operandOrder: II
addDoc: !string |-
@@ -951,7 +951,7 @@
out:
- *v

-- go: Select128FromPair
+- go: ConcatPermute128Scalars
asm: VPERM2I128
operandOrder: II
addDoc: !string |-
@@ -975,7 +975,7 @@
out:
- *v

-- go: Select128FromPair
+- go: ConcatPermute128Scalars
asm: VPERM2I128
operandOrder: II
addDoc: !string |-
@@ -1000,7 +1000,7 @@
out:
- *v

-- go: Select128FromPair
+- go: ConcatPermute128Scalars
asm: VPERM2I128
operandOrder: II
addDoc: !string |-
diff --git a/src/simd/archsimd/internal/simd_test/simd_test.go b/src/simd/archsimd/internal/simd_test/simd_test.go
index 224d0bf..d3148dc 100644
--- a/src/simd/archsimd/internal/simd_test/simd_test.go
+++ b/src/simd/archsimd/internal/simd_test/simd_test.go
@@ -666,25 +666,25 @@
x := archsimd.LoadInt32x4([]int32{0, 1, 2, 3})
y := archsimd.LoadInt32x4([]int32{4, 5, 6, 7})

- llll := x.SelectFromPair(0, 1, 2, 3, y)
- hhhh := x.SelectFromPair(4, 5, 6, 7, y)
- llhh := x.SelectFromPair(0, 1, 6, 7, y)
- hhll := x.SelectFromPair(6, 7, 0, 1, y)
+ llll := x.ConcatPermuteScalars(0, 1, 2, 3, y)
+ hhhh := x.ConcatPermuteScalars(4, 5, 6, 7, y)
+ llhh := x.ConcatPermuteScalars(0, 1, 6, 7, y)
+ hhll := x.ConcatPermuteScalars(6, 7, 0, 1, y)

- lllh := x.SelectFromPair(0, 1, 2, 7, y)
- llhl := x.SelectFromPair(0, 1, 7, 2, y)
- lhll := x.SelectFromPair(0, 7, 1, 2, y)
- hlll := x.SelectFromPair(7, 0, 1, 2, y)
+ lllh := x.ConcatPermuteScalars(0, 1, 2, 7, y)
+ llhl := x.ConcatPermuteScalars(0, 1, 7, 2, y)
+ lhll := x.ConcatPermuteScalars(0, 7, 1, 2, y)
+ hlll := x.ConcatPermuteScalars(7, 0, 1, 2, y)

- hhhl := x.SelectFromPair(4, 5, 6, 0, y)
- hhlh := x.SelectFromPair(4, 5, 0, 6, y)
- hlhh := x.SelectFromPair(4, 0, 5, 6, y)
- lhhh := x.SelectFromPair(0, 4, 5, 6, y)
+ hhhl := x.ConcatPermuteScalars(4, 5, 6, 0, y)
+ hhlh := x.ConcatPermuteScalars(4, 5, 0, 6, y)
+ hlhh := x.ConcatPermuteScalars(4, 0, 5, 6, y)
+ lhhh := x.ConcatPermuteScalars(0, 4, 5, 6, y)

- lhlh := x.SelectFromPair(0, 4, 1, 5, y)
- hlhl := x.SelectFromPair(4, 0, 5, 1, y)
- lhhl := x.SelectFromPair(0, 4, 5, 1, y)
- hllh := x.SelectFromPair(4, 0, 1, 5, y)
+ lhlh := x.ConcatPermuteScalars(0, 4, 1, 5, y)
+ hlhl := x.ConcatPermuteScalars(4, 0, 5, 1, y)
+ lhhl := x.ConcatPermuteScalars(0, 4, 5, 1, y)
+ hllh := x.ConcatPermuteScalars(4, 0, 1, 5, y)

r := make([]int32, 4, 4)

@@ -716,7 +716,7 @@

//go:noinline
func selectFromPairInt32x4(x archsimd.Int32x4, a, b, c, d uint8, y archsimd.Int32x4) archsimd.Int32x4 {
- return x.SelectFromPair(a, b, c, d, y)
+ return x.ConcatPermuteScalars(a, b, c, d, y)
}

func TestSelect4FromPairVar(t *testing.T) {
@@ -775,25 +775,25 @@
x := archsimd.LoadFloat32x8([]float32{0, 1, 2, 3, 10, 11, 12, 13})
y := archsimd.LoadFloat32x8([]float32{4, 5, 6, 7, 14, 15, 16, 17})

- llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
- hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
- llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
- hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
+ llll := x.ConcatPermuteScalarsGrouped(0, 1, 2, 3, y)
+ hhhh := x.ConcatPermuteScalarsGrouped(4, 5, 6, 7, y)
+ llhh := x.ConcatPermuteScalarsGrouped(0, 1, 6, 7, y)
+ hhll := x.ConcatPermuteScalarsGrouped(6, 7, 0, 1, y)

- lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
- llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
- lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
- hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
+ lllh := x.ConcatPermuteScalarsGrouped(0, 1, 2, 7, y)
+ llhl := x.ConcatPermuteScalarsGrouped(0, 1, 7, 2, y)
+ lhll := x.ConcatPermuteScalarsGrouped(0, 7, 1, 2, y)
+ hlll := x.ConcatPermuteScalarsGrouped(7, 0, 1, 2, y)

- hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
- hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
- hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
- lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
+ hhhl := x.ConcatPermuteScalarsGrouped(4, 5, 6, 0, y)
+ hhlh := x.ConcatPermuteScalarsGrouped(4, 5, 0, 6, y)
+ hlhh := x.ConcatPermuteScalarsGrouped(4, 0, 5, 6, y)
+ lhhh := x.ConcatPermuteScalarsGrouped(0, 4, 5, 6, y)

- lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
- hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
- lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
- hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
+ lhlh := x.ConcatPermuteScalarsGrouped(0, 4, 1, 5, y)
+ hlhl := x.ConcatPermuteScalarsGrouped(4, 0, 5, 1, y)
+ lhhl := x.ConcatPermuteScalarsGrouped(0, 4, 5, 1, y)
+ hllh := x.ConcatPermuteScalarsGrouped(4, 0, 1, 5, y)

r := make([]float32, 8, 8)

@@ -823,7 +823,7 @@
foo(hllh, 4, 0, 1, 5)
}

-func TestSelectFromPairConstGroupedUint32x16(t *testing.T) {
+func TestConcatPermuteScalarsConstGroupedUint32x16(t *testing.T) {
if !archsimd.X86.AVX512() {
t.Skip("Test requires X86.AVX512, not available on this hardware")
return
@@ -831,25 +831,25 @@
x := archsimd.LoadUint32x16([]uint32{0, 1, 2, 3, 10, 11, 12, 13, 20, 21, 22, 23, 30, 31, 32, 33})
y := archsimd.LoadUint32x16([]uint32{4, 5, 6, 7, 14, 15, 16, 17, 24, 25, 26, 27, 34, 35, 36, 37})

- llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
- hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
- llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
- hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
+ llll := x.ConcatPermuteScalarsGrouped(0, 1, 2, 3, y)
+ hhhh := x.ConcatPermuteScalarsGrouped(4, 5, 6, 7, y)
+ llhh := x.ConcatPermuteScalarsGrouped(0, 1, 6, 7, y)
+ hhll := x.ConcatPermuteScalarsGrouped(6, 7, 0, 1, y)

- lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
- llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
- lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
- hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
+ lllh := x.ConcatPermuteScalarsGrouped(0, 1, 2, 7, y)
+ llhl := x.ConcatPermuteScalarsGrouped(0, 1, 7, 2, y)
+ lhll := x.ConcatPermuteScalarsGrouped(0, 7, 1, 2, y)
+ hlll := x.ConcatPermuteScalarsGrouped(7, 0, 1, 2, y)

- hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
- hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
- hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
- lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
+ hhhl := x.ConcatPermuteScalarsGrouped(4, 5, 6, 0, y)
+ hhlh := x.ConcatPermuteScalarsGrouped(4, 5, 0, 6, y)
+ hlhh := x.ConcatPermuteScalarsGrouped(4, 0, 5, 6, y)
+ lhhh := x.ConcatPermuteScalarsGrouped(0, 4, 5, 6, y)

- lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
- hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
- lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
- hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
+ lhlh := x.ConcatPermuteScalarsGrouped(0, 4, 1, 5, y)
+ hlhl := x.ConcatPermuteScalarsGrouped(4, 0, 5, 1, y)
+ lhhl := x.ConcatPermuteScalarsGrouped(0, 4, 5, 1, y)
+ hllh := x.ConcatPermuteScalarsGrouped(4, 0, 1, 5, y)

r := make([]uint32, 16, 16)

@@ -883,16 +883,16 @@
foo(hllh, 4, 0, 1, 5)
}

-func TestSelect128FromPair(t *testing.T) {
+func TestConcatPermute128Scalars(t *testing.T) {
x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})

- aa := x.Select128FromPair(0, 0, y)
- ab := x.Select128FromPair(0, 1, y)
- bc := x.Select128FromPair(1, 2, y)
- cd := x.Select128FromPair(2, 3, y)
- da := x.Select128FromPair(3, 0, y)
- dc := x.Select128FromPair(3, 2, y)
+ aa := x.ConcatPermute128Scalars(0, 0, y)
+ ab := x.ConcatPermute128Scalars(0, 1, y)
+ bc := x.ConcatPermute128Scalars(1, 2, y)
+ cd := x.ConcatPermute128Scalars(2, 3, y)
+ da := x.ConcatPermute128Scalars(3, 0, y)
+ dc := x.ConcatPermute128Scalars(3, 2, y)

r := make([]uint64, 4, 4)

@@ -910,7 +910,7 @@
foo(dc, 3, 2)
}

-func TestSelect128FromPairError(t *testing.T) {
+func TestConcatPermute128ScalarsError(t *testing.T) {
x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})

@@ -919,17 +919,17 @@
t.Logf("Saw expected panic %v", r)
}
}()
- _ = x.Select128FromPair(0, 4, y)
+ _ = x.ConcatPermute128Scalars(0, 4, y)

t.Errorf("Should have panicked")
}

//go:noinline
func select128FromPair(x archsimd.Uint64x4, lo, hi uint8, y archsimd.Uint64x4) archsimd.Uint64x4 {
- return x.Select128FromPair(lo, hi, y)
+ return x.ConcatPermute128Scalars(lo, hi, y)
}

-func TestSelect128FromPairVar(t *testing.T) {
+func TestConcatPermute128ScalarsVar(t *testing.T) {
x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})

@@ -960,10 +960,10 @@
x := archsimd.LoadUint64x2([]uint64{0, 1})
y := archsimd.LoadUint64x2([]uint64{2, 3})

- ll := x.SelectFromPair(0, 1, y)
- hh := x.SelectFromPair(3, 2, y)
- lh := x.SelectFromPair(0, 3, y)
- hl := x.SelectFromPair(2, 1, y)
+ ll := x.ConcatPermuteScalars(0, 1, y)
+ hh := x.ConcatPermuteScalars(3, 2, y)
+ lh := x.ConcatPermuteScalars(0, 3, y)
+ hl := x.ConcatPermuteScalars(2, 1, y)

r := make([]uint64, 2, 2)

@@ -982,10 +982,10 @@
x := archsimd.LoadUint64x4([]uint64{0, 1, 10, 11})
y := archsimd.LoadUint64x4([]uint64{2, 3, 12, 13})

- ll := x.SelectFromPairGrouped(0, 1, y)
- hh := x.SelectFromPairGrouped(3, 2, y)
- lh := x.SelectFromPairGrouped(0, 3, y)
- hl := x.SelectFromPairGrouped(2, 1, y)
+ ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
+ hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
+ lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
+ hl := x.ConcatPermuteScalarsGrouped(2, 1, y)

r := make([]uint64, 4, 4)

@@ -1004,10 +1004,10 @@
x := archsimd.LoadFloat64x4([]float64{0, 1, 10, 11})
y := archsimd.LoadFloat64x4([]float64{2, 3, 12, 13})

- ll := x.SelectFromPairGrouped(0, 1, y)
- hh := x.SelectFromPairGrouped(3, 2, y)
- lh := x.SelectFromPairGrouped(0, 3, y)
- hl := x.SelectFromPairGrouped(2, 1, y)
+ ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
+ hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
+ lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
+ hl := x.ConcatPermuteScalarsGrouped(2, 1, y)

r := make([]float64, 4, 4)

@@ -1026,10 +1026,10 @@
x := archsimd.LoadInt64x4([]int64{0, 1, 10, 11})
y := archsimd.LoadInt64x4([]int64{2, 3, 12, 13})

- ll := x.SelectFromPairGrouped(0, 1, y)
- hh := x.SelectFromPairGrouped(3, 2, y)
- lh := x.SelectFromPairGrouped(0, 3, y)
- hl := x.SelectFromPairGrouped(2, 1, y)
+ ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
+ hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
+ lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
+ hl := x.ConcatPermuteScalarsGrouped(2, 1, y)

r := make([]int64, 4, 4)

@@ -1053,10 +1053,10 @@
x := archsimd.LoadInt64x8([]int64{0, 1, 10, 11, 20, 21, 30, 31})
y := archsimd.LoadInt64x8([]int64{2, 3, 12, 13, 22, 23, 32, 33})

- ll := x.SelectFromPairGrouped(0, 1, y)
- hh := x.SelectFromPairGrouped(3, 2, y)
- lh := x.SelectFromPairGrouped(0, 3, y)
- hl := x.SelectFromPairGrouped(2, 1, y)
+ ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
+ hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
+ lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
+ hl := x.ConcatPermuteScalarsGrouped(2, 1, y)

r := make([]int64, 8, 8)

diff --git a/src/simd/archsimd/internal/simd_test/transpose_test.go b/src/simd/archsimd/internal/simd_test/transpose_test.go
index abd0706..faa87a7 100644
--- a/src/simd/archsimd/internal/simd_test/transpose_test.go
+++ b/src/simd/archsimd/internal/simd_test/transpose_test.go
@@ -27,10 +27,10 @@
// C3G7
// D4H8

- b0 = t0.SelectFromPair(0, 1, 4, 5, t2) // lower elements from each
- b1 = t0.SelectFromPair(2, 3, 6, 7, t2) // upper elements from each
- b2 = t1.SelectFromPair(0, 1, 4, 5, t3) // lowers
- b3 = t1.SelectFromPair(2, 3, 6, 7, t3) // uppers
+ b0 = t0.ConcatPermuteScalars(0, 1, 4, 5, t2) // lower elements from each
+ b1 = t0.ConcatPermuteScalars(2, 3, 6, 7, t2) // upper elements from each
+ b2 = t1.ConcatPermuteScalars(0, 1, 4, 5, t3) // lowers
+ b3 = t1.ConcatPermuteScalars(2, 3, 6, 7, t3) // uppers
return
}

@@ -51,29 +51,29 @@
// C3G7
// D4H8

- a0 = t0.SelectFromPairGrouped(0, 1, 4, 5, t2) // lower elements from each
- a1 = t0.SelectFromPairGrouped(2, 3, 6, 7, t2) // upper elements from each
- a2 = t1.SelectFromPairGrouped(0, 1, 4, 5, t3) // lowers
- a3 = t1.SelectFromPairGrouped(2, 3, 6, 7, t3) // uppers
+ a0 = t0.ConcatPermuteScalarsGrouped(0, 1, 4, 5, t2) // lower elements from each
+ a1 = t0.ConcatPermuteScalarsGrouped(2, 3, 6, 7, t2) // upper elements from each
+ a2 = t1.ConcatPermuteScalarsGrouped(0, 1, 4, 5, t3) // lowers
+ a3 = t1.ConcatPermuteScalarsGrouped(2, 3, 6, 7, t3) // uppers

- a4 = t4.SelectFromPairGrouped(0, 1, 4, 5, t6) // lower elements from each
- a5 = t4.SelectFromPairGrouped(2, 3, 6, 7, t6) // upper elements from each
- a6 = t5.SelectFromPairGrouped(0, 1, 4, 5, t7) // lowers
- a7 = t5.SelectFromPairGrouped(2, 3, 6, 7, t7) // uppers
+ a4 = t4.ConcatPermuteScalarsGrouped(0, 1, 4, 5, t6) // lower elements from each
+ a5 = t4.ConcatPermuteScalarsGrouped(2, 3, 6, 7, t6) // upper elements from each
+ a6 = t5.ConcatPermuteScalarsGrouped(0, 1, 4, 5, t7) // lowers
+ a7 = t5.ConcatPermuteScalarsGrouped(2, 3, 6, 7, t7) // uppers

// next need to swap the upper 128 bits of a0-a3 with the lower 128 bits of a4-a7

- b0 = a0.Select128FromPair(0, 2, a4)
- b4 = a0.Select128FromPair(1, 3, a4)
+ b0 = a0.ConcatPermute128Scalars(0, 2, a4)
+ b4 = a0.ConcatPermute128Scalars(1, 3, a4)

- b1 = a1.Select128FromPair(0, 2, a5)
- b5 = a1.Select128FromPair(1, 3, a5)
+ b1 = a1.ConcatPermute128Scalars(0, 2, a5)
+ b5 = a1.ConcatPermute128Scalars(1, 3, a5)

- b2 = a2.Select128FromPair(0, 2, a6)
- b6 = a2.Select128FromPair(1, 3, a6)
+ b2 = a2.ConcatPermute128Scalars(0, 2, a6)
+ b6 = a2.ConcatPermute128Scalars(1, 3, a6)

- b3 = a3.Select128FromPair(0, 2, a7)
- b7 = a3.Select128FromPair(1, 3, a7)
+ b3 = a3.ConcatPermute128Scalars(0, 2, a7)
+ b7 = a3.ConcatPermute128Scalars(1, 3, a7)

return
}
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index 5cfe5fb..8a720e0 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -1399,6 +1399,172 @@
// Asm: VPERMI2Q, CPU Feature: AVX512
func (x Uint64x8) ConcatPermute(y Uint64x8, indices Uint64x8) Uint64x8

+/* ConcatPermute128Scalars */
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 42, 43, 50, 51, 52, 53}.ConcatPermute128Scalars(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2F128, CPU Feature: AVX
+func (x Float32x8) ConcatPermute128Scalars(lo, hi uint8, y Float32x8) Float32x8
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 50, 51}.ConcatPermute128Scalars(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2F128, CPU Feature: AVX
+func (x Float64x4) ConcatPermute128Scalars(lo, hi uint8, y Float64x4) Float64x4
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.ConcatPermute128Scalars(3, 0,
+// {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
+//
+// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int8x32) ConcatPermute128Scalars(lo, hi uint8, y Int8x32) Int8x32
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.ConcatPermute128Scalars(3, 0,
+// {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
+//
+// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int16x16) ConcatPermute128Scalars(lo, hi uint8, y Int16x16) Int16x16
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 42, 43, 50, 51, 52, 53}.ConcatPermute128Scalars(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int32x8) ConcatPermute128Scalars(lo, hi uint8, y Int32x8) Int32x8
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 50, 51}.ConcatPermute128Scalars(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int64x4) ConcatPermute128Scalars(lo, hi uint8, y Int64x4) Int64x4
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.ConcatPermute128Scalars(3, 0,
+// {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
+//
+// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint8x32) ConcatPermute128Scalars(lo, hi uint8, y Uint8x32) Uint8x32
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.ConcatPermute128Scalars(3, 0,
+// {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
+//
+// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint16x16) ConcatPermute128Scalars(lo, hi uint8, y Uint16x16) Uint16x16
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 42, 43, 50, 51, 52, 53}.ConcatPermute128Scalars(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint32x8) ConcatPermute128Scalars(lo, hi uint8, y Uint32x8) Uint32x8
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 50, 51}.ConcatPermute128Scalars(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint64x4) ConcatPermute128Scalars(lo, hi uint8, y Uint64x4) Uint64x4
+
/* ConcatShiftBytesRight */

// ConcatShiftBytesRight concatenates x and y and shifts it right by shift bytes.
@@ -5531,172 +5697,6 @@
// Asm: VSCALEFPD, CPU Feature: AVX512
func (x Float64x8) Scale(y Float64x8) Float64x8

-/* Select128FromPair */
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
-//
-// returns {70, 71, 72, 73, 40, 41, 42, 43}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2F128, CPU Feature: AVX
-func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
-//
-// returns {70, 71, 40, 41}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2F128, CPU Feature: AVX
-func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0,
-// {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
-//
-// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int8x32) Select128FromPair(lo, hi uint8, y Int8x32) Int8x32
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0,
-// {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
-//
-// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int16x16) Select128FromPair(lo, hi uint8, y Int16x16) Int16x16
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
-//
-// returns {70, 71, 72, 73, 40, 41, 42, 43}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
-//
-// returns {70, 71, 40, 41}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0,
-// {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
-//
-// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint8x32) Select128FromPair(lo, hi uint8, y Uint8x32) Uint8x32
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0,
-// {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
-//
-// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint16x16) Select128FromPair(lo, hi uint8, y Uint16x16) Uint16x16
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
-//
-// returns {70, 71, 72, 73, 40, 41, 42, 43}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
-//
-// returns {70, 71, 40, 41}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint64x4) Select128FromPair(lo, hi uint8, y Uint64x4) Uint64x4
-
/* SetElem */

// SetElem returns x with the index'th element set to y.
diff --git a/src/simd/archsimd/shuffles_amd64.go b/src/simd/archsimd/shuffles_amd64.go
index 1ca6a15..d4e8d09 100644
--- a/src/simd/archsimd/shuffles_amd64.go
+++ b/src/simd/archsimd/shuffles_amd64.go
@@ -7,7 +7,7 @@
package archsimd

// These constants represent the source pattern for the four parameters
-// (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped.
+// (a, b, c, d) passed to ConcatPermuteScalars and ConcatPermuteScalarsGrouped.
// L means the element comes from the 'x' vector (Low), and
// H means it comes from the 'y' vector (High).
// The order of the letters corresponds to elements a, b, c, d.
@@ -37,7 +37,7 @@
)

// These constants represent the source pattern for the four parameters
-// (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped for
+// (a, b, c, d) passed to ConcatPermuteScalars and ConcatPermuteScalarsGrouped for
// two-element vectors.
const (
_LL = iota
@@ -46,7 +46,7 @@
_HH
)

-// SelectFromPair returns the selection of four elements from the two
+// ConcatPermuteScalars returns the selection of four elements from the two
// vectors x and y, where selector values in the range 0-3 specify
// elements from x and values in the range 4-7 specify the 0-3 elements
// of y. When the selectors are constants and the selection can be
@@ -55,7 +55,7 @@
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output. For example,
//
-// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81})
+// {1,2,4,8}.ConcatPermuteScalars(2,3,5,7,{9,25,49,81})
//
// returns {4,8,25,81}.
//
@@ -63,7 +63,7 @@
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
-func (x Int32x4) SelectFromPair(a, b, c, d uint8, y Int32x4) Int32x4 {
+func (x Int32x4) ConcatPermuteScalars(a, b, c, d uint8, y Int32x4) Int32x4 {
// pattern gets the concatenation of "x or y?" bits
// (0 == x, 1 == y)
// This will determine operand choice/order and whether a second
@@ -128,7 +128,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPair returns the selection of four elements from the two
+// ConcatPermuteScalars returns the selection of four elements from the two
// vectors x and y, where selector values in the range 0-3 specify
// elements from x and values in the range 4-7 specify the 0-3 elements
// of y. When the selectors are constants and can be the selection
@@ -137,7 +137,7 @@
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output. For example,
//
-// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81})
+// {1,2,4,8}.ConcatPermuteScalars(2,3,5,7,{9,25,49,81})
//
// returns {4,8,25,81}.
//
@@ -145,7 +145,7 @@
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
-func (x Uint32x4) SelectFromPair(a, b, c, d uint8, y Uint32x4) Uint32x4 {
+func (x Uint32x4) ConcatPermuteScalars(a, b, c, d uint8, y Uint32x4) Uint32x4 {
pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1

a, b, c, d = a&3, b&3, c&3, d&3
@@ -203,7 +203,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPair returns the selection of four elements from the two
+// ConcatPermuteScalars returns the selection of four elements from the two
// vectors x and y, where selector values in the range 0-3 specify
// elements from x and values in the range 4-7 specify the 0-3 elements
// of y. When the selectors are constants and can be the selection
@@ -212,7 +212,7 @@
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output. For example,
//
-// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81})
+// {1,2,4,8}.ConcatPermuteScalars(2,3,5,7,{9,25,49,81})
//
// returns {4,8,25,81}.
//
@@ -220,7 +220,7 @@
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
-func (x Float32x4) SelectFromPair(a, b, c, d uint8, y Float32x4) Float32x4 {
+func (x Float32x4) ConcatPermuteScalars(a, b, c, d uint8, y Float32x4) Float32x4 {
pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1

a, b, c, d = a&3, b&3, c&3, d&3
@@ -278,7 +278,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of four elements from x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
@@ -288,7 +288,7 @@
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output. For example,
//
-// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
+// {1,2,4,8,16,32,64,128}.ConcatPermuteScalarsGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
//
// returns {4,8,25,81,64,128,169,289}.
//
@@ -296,7 +296,7 @@
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
-func (x Int32x8) SelectFromPairGrouped(a, b, c, d uint8, y Int32x8) Int32x8 {
+func (x Int32x8) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Int32x8) Int32x8 {
pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1

a, b, c, d = a&3, b&3, c&3, d&3
@@ -354,7 +354,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of four elements from x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
@@ -364,7 +364,7 @@
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output. For example,
//
-// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
+// {1,2,4,8,16,32,64,128}.ConcatPermuteScalarsGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
//
// returns {4,8,25,81,64,128,169,289}.
//
@@ -372,7 +372,7 @@
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
-func (x Uint32x8) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 {
+func (x Uint32x8) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 {
pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1

a, b, c, d = a&3, b&3, c&3, d&3
@@ -430,7 +430,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of four elements from x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
@@ -440,7 +440,7 @@
// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
// elements in the output. For example,
//
-// {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
+// {1,2,4,8,16,32,64,128}.ConcatPermuteScalarsGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
//
// returns {4,8,25,81,64,128,169,289}.
//
@@ -448,7 +448,7 @@
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX
-func (x Float32x8) SelectFromPairGrouped(a, b, c, d uint8, y Float32x8) Float32x8 {
+func (x Float32x8) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Float32x8) Float32x8 {
pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1

a, b, c, d = a&3, b&3, c&3, d&3
@@ -506,7 +506,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of four elements from x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
@@ -518,7 +518,7 @@
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Int32x16) SelectFromPairGrouped(a, b, c, d uint8, y Int32x16) Int32x16 {
+func (x Int32x16) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Int32x16) Int32x16 {
pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1

a, b, c, d = a&3, b&3, c&3, d&3
@@ -576,7 +576,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of four elements from x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
@@ -588,7 +588,7 @@
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Uint32x16) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x16) Uint32x16 {
+func (x Uint32x16) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Uint32x16) Uint32x16 {
pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1

a, b, c, d = a&3, b&3, c&3, d&3
@@ -646,7 +646,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of four elements from x and y,
// where selector values in the range 0-3 specify elements from x and
// values in the range 4-7 specify the 0-3 elements of y.
@@ -658,7 +658,7 @@
// call.
//
// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Float32x16) SelectFromPairGrouped(a, b, c, d uint8, y Float32x16) Float32x16 {
+func (x Float32x16) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Float32x16) Float32x16 {
pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1

a, b, c, d = a&3, b&3, c&3, d&3
@@ -744,7 +744,7 @@
return g + g<<4
}

-// SelectFromPair returns the selection of two elements from the two
+// ConcatPermuteScalars returns the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y. When the selectors are constants the selection can be
@@ -754,7 +754,7 @@
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
-func (x Uint64x2) SelectFromPair(a, b uint8, y Uint64x2) Uint64x2 {
+func (x Uint64x2) ConcatPermuteScalars(a, b uint8, y Uint64x2) Uint64x2 {
pattern := (a&2)>>1 + (b & 2)

a, b = a&1, b&1
@@ -772,7 +772,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
@@ -783,7 +783,7 @@
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
-func (x Uint64x4) SelectFromPairGrouped(a, b uint8, y Uint64x4) Uint64x4 {
+func (x Uint64x4) ConcatPermuteScalarsGrouped(a, b uint8, y Uint64x4) Uint64x4 {
pattern := (a&2)>>1 + (b & 2)

a, b = a&1, b&1
@@ -801,7 +801,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
@@ -812,7 +812,7 @@
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Uint64x8) SelectFromPairGrouped(a, b uint8, y Uint64x8) Uint64x8 {
+func (x Uint64x8) ConcatPermuteScalarsGrouped(a, b uint8, y Uint64x8) Uint64x8 {
pattern := (a&2)>>1 + (b & 2)

a, b = a&1, b&1
@@ -830,7 +830,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPair returns the selection of two elements from the two
+// ConcatPermuteScalars returns the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y. When the selectors are constants the selection can be
@@ -840,7 +840,7 @@
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
-func (x Float64x2) SelectFromPair(a, b uint8, y Float64x2) Float64x2 {
+func (x Float64x2) ConcatPermuteScalars(a, b uint8, y Float64x2) Float64x2 {
pattern := (a&2)>>1 + (b & 2)

a, b = a&1, b&1
@@ -858,7 +858,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
@@ -869,7 +869,7 @@
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
-func (x Float64x4) SelectFromPairGrouped(a, b uint8, y Float64x4) Float64x4 {
+func (x Float64x4) ConcatPermuteScalarsGrouped(a, b uint8, y Float64x4) Float64x4 {
pattern := (a&2)>>1 + (b & 2)

a, b = a&1, b&1
@@ -887,7 +887,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
@@ -898,7 +898,7 @@
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Float64x8) SelectFromPairGrouped(a, b uint8, y Float64x8) Float64x8 {
+func (x Float64x8) ConcatPermuteScalarsGrouped(a, b uint8, y Float64x8) Float64x8 {
pattern := (a&2)>>1 + (b & 2)

a, b = a&1, b&1
@@ -916,7 +916,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPair returns the selection of two elements from the two
+// ConcatPermuteScalars returns the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
// of y. When the selectors are constants the selection can be
@@ -926,7 +926,7 @@
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
-func (x Int64x2) SelectFromPair(a, b uint8, y Int64x2) Int64x2 {
+func (x Int64x2) ConcatPermuteScalars(a, b uint8, y Int64x2) Int64x2 {
pattern := (a&2)>>1 + (b & 2)

a, b = a&1, b&1
@@ -944,7 +944,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
// the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
@@ -955,7 +955,7 @@
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX
-func (x Int64x4) SelectFromPairGrouped(a, b uint8, y Int64x4) Int64x4 {
+func (x Int64x4) ConcatPermuteScalarsGrouped(a, b uint8, y Int64x4) Int64x4 {
pattern := (a&2)>>1 + (b & 2)

a, b = a&1, b&1
@@ -973,7 +973,7 @@
panic("missing case, switch should be exhaustive")
}

-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
// of the vectors x and y, the selection of two elements from the two
// vectors x and y, where selector values in the range 0-1 specify
// elements from x and values in the range 2-3 specify the 0-1 elements
@@ -984,7 +984,7 @@
// call.
//
// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 {
+func (x Int64x8) ConcatPermuteScalarsGrouped(a, b uint8, y Int64x8) Int64x8 {
pattern := (a&2)>>1 + (b & 2)

a, b = a&1, b&1

Change information

Files:
  • M src/cmd/compile/internal/amd64/simdssa.go
  • M src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
  • M src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
  • M src/cmd/compile/internal/ssa/opGen.go
  • M src/cmd/compile/internal/ssa/rewriteAMD64.go
  • M src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
  • M src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
  • M src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
  • M src/simd/archsimd/internal/simd_test/simd_test.go
  • M src/simd/archsimd/internal/simd_test/transpose_test.go
  • M src/simd/archsimd/ops_amd64.go
  • M src/simd/archsimd/shuffles_amd64.go
Change size: L
Delta: 12 files changed, 451 insertions(+), 451 deletions(-)
Open in Gerrit

Related details

Attention set is empty
Submit Requirements:
  • Code-Review: requirement is not satisfied
  • No-Unresolved-Comments: requirement satisfied
  • Review-Enforcement: requirement is not satisfied
  • TryBots-Pass: requirement is not satisfied
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: newchange
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: Ida086183afad589e6c457c7f5fff508a71a5f2dd
Gerrit-Change-Number: 777020
Gerrit-PatchSet: 1
Gerrit-Owner: Junyang Shao <shaoj...@google.com>
unsatisfied_requirement
satisfied_requirement
open
diffy

Junyang Shao (Gerrit)

unread,
5:11 PM (6 hours ago) 5:11 PM
to goph...@pubsubhelper.golang.org, David Chase, golang-co...@googlegroups.com
Attention needed from David Chase

Junyang Shao voted Commit-Queue+1

Commit-Queue+1
Open in Gerrit

Related details

Attention is currently required from:
  • David Chase
Submit Requirements:
  • Code-Review: requirement is not satisfied
  • No-Unresolved-Comments: requirement satisfied
  • Review-Enforcement: requirement is not satisfied
  • TryBots-Pass: requirement is not satisfied
Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. DiffyGerrit
Gerrit-MessageType: comment
Gerrit-Project: go
Gerrit-Branch: dev.simd
Gerrit-Change-Id: Ida086183afad589e6c457c7f5fff508a71a5f2dd
Gerrit-Change-Number: 777020
Gerrit-PatchSet: 2
Gerrit-Owner: Junyang Shao <shaoj...@google.com>
Gerrit-Reviewer: David Chase <drc...@google.com>
Gerrit-Reviewer: Junyang Shao <shaoj...@google.com>
Gerrit-Attention: David Chase <drc...@google.com>
Gerrit-Comment-Date: Mon, 11 May 2026 21:11:21 +0000
Gerrit-HasComments: No
Gerrit-Has-Labels: Yes
unsatisfied_requirement
satisfied_requirement
open
diffy
Reply all
Reply to author
Forward
0 new messages