[go/dev.simd] [dev.simd] simd, cmd/compile: change Select(128)?FromPair names

1 view

Skip to first unread message

Junyang Shao (Gerrit)

unread,

5:09 PM (6 hours ago)

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Junyang Shao has uploaded the change for review

Commit message

[dev.simd] simd, cmd/compile: change Select(128)?FromPair names

This CL renames Select(128)?FromPair to ConcatPermute(128)?Scalars.

For #78979.

Change-Id: Ida086183afad589e6c457c7f5fff508a71a5f2dd

Change diff

diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 402e711..9a0febe 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -1211,7 +1211,9 @@
 		ssa.OpAMD64VPSRLQMasked512const:
 		p = simdVkvImm8(s, v)
 
-	case ssa.OpAMD64VPALIGNR128,
+	case ssa.OpAMD64VPERM2F128256,
+		ssa.OpAMD64VPERM2I128256,
+		ssa.OpAMD64VPALIGNR128,
 		ssa.OpAMD64VPALIGNR256,
 		ssa.OpAMD64VPALIGNR512,
 		ssa.OpAMD64VCMPPS128,
@@ -1224,8 +1226,6 @@
 		ssa.OpAMD64VGF2P8AFFINEINVQB128,
 		ssa.OpAMD64VGF2P8AFFINEINVQB256,
 		ssa.OpAMD64VGF2P8AFFINEINVQB512,
-		ssa.OpAMD64VPERM2F128256,
-		ssa.OpAMD64VPERM2I128256,
 		ssa.OpAMD64VINSERTF128256,
 		ssa.OpAMD64VINSERTF64X4512,
 		ssa.OpAMD64VINSERTI128256,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index cdbedbc..fc3461e 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -216,6 +216,16 @@
 (ConcatPermuteUint64x2 ...) => (VPERMI2Q128 ...)
 (ConcatPermuteUint64x4 ...) => (VPERMI2Q256 ...)
 (ConcatPermuteUint64x8 ...) => (VPERMI2Q512 ...)
+(ConcatPermute128ScalarsFloat32x8 ...) => (VPERM2F128256 ...)
+(ConcatPermute128ScalarsFloat64x4 ...) => (VPERM2F128256 ...)
+(ConcatPermute128ScalarsInt8x32 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsInt16x16 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsInt32x8 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsInt64x4 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsUint8x32 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsUint16x16 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsUint32x8 ...) => (VPERM2I128256 ...)
+(ConcatPermute128ScalarsUint64x4 ...) => (VPERM2I128256 ...)
 (ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
 (ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
 (ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
@@ -918,16 +928,6 @@
 (ScaleFloat64x2 ...) => (VSCALEFPD128 ...)
 (ScaleFloat64x4 ...) => (VSCALEFPD256 ...)
 (ScaleFloat64x8 ...) => (VSCALEFPD512 ...)
-(Select128FromPairFloat32x8 ...) => (VPERM2F128256 ...)
-(Select128FromPairFloat64x4 ...) => (VPERM2F128256 ...)
-(Select128FromPairInt8x32 ...) => (VPERM2I128256 ...)
-(Select128FromPairInt16x16 ...) => (VPERM2I128256 ...)
-(Select128FromPairInt32x8 ...) => (VPERM2I128256 ...)
-(Select128FromPairInt64x4 ...) => (VPERM2I128256 ...)
-(Select128FromPairUint8x32 ...) => (VPERM2I128256 ...)
-(Select128FromPairUint16x16 ...) => (VPERM2I128256 ...)
-(Select128FromPairUint32x8 ...) => (VPERM2I128256 ...)
-(Select128FromPairUint64x4 ...) => (VPERM2I128256 ...)
 (SetElemFloat32x4 ...) => (VPINSRD128 ...)
 (SetElemFloat64x2 ...) => (VPINSRQ128 ...)
 (SetElemInt8x16 ...) => (VPINSRB128 ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 3644bbc..c9f7929 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -1183,6 +1183,16 @@
 		{name: "CeilScaledResidueFloat64x2", argLength: 1, aux: "UInt8"},                // ARCH:amd64
 		{name: "CeilScaledResidueFloat64x4", argLength: 1, aux: "UInt8"},                // ARCH:amd64
 		{name: "CeilScaledResidueFloat64x8", argLength: 1, aux: "UInt8"},                // ARCH:amd64
+		{name: "ConcatPermute128ScalarsFloat32x8", argLength: 2, aux: "UInt8"},          // ARCH:amd64
+		{name: "ConcatPermute128ScalarsFloat64x4", argLength: 2, aux: "UInt8"},          // ARCH:amd64
+		{name: "ConcatPermute128ScalarsInt8x32", argLength: 2, aux: "UInt8"},            // ARCH:amd64
+		{name: "ConcatPermute128ScalarsInt16x16", argLength: 2, aux: "UInt8"},           // ARCH:amd64
+		{name: "ConcatPermute128ScalarsInt32x8", argLength: 2, aux: "UInt8"},            // ARCH:amd64
+		{name: "ConcatPermute128ScalarsInt64x4", argLength: 2, aux: "UInt8"},            // ARCH:amd64
+		{name: "ConcatPermute128ScalarsUint8x32", argLength: 2, aux: "UInt8"},           // ARCH:amd64
+		{name: "ConcatPermute128ScalarsUint16x16", argLength: 2, aux: "UInt8"},          // ARCH:amd64
+		{name: "ConcatPermute128ScalarsUint32x8", argLength: 2, aux: "UInt8"},           // ARCH:amd64
+		{name: "ConcatPermute128ScalarsUint64x4", argLength: 2, aux: "UInt8"},           // ARCH:amd64
 		{name: "ConcatShiftBytesRightGroupedUint8x32", argLength: 2, aux: "UInt8"},      // ARCH:amd64
 		{name: "ConcatShiftBytesRightGroupedUint8x64", argLength: 2, aux: "UInt8"},      // ARCH:amd64
 		{name: "ConcatShiftBytesRightUint8x16", argLength: 2, aux: "UInt8"},             // ARCH:amd64
@@ -1251,16 +1261,6 @@
 		{name: "RoundScaledResidueFloat64x4", argLength: 1, aux: "UInt8"},               // ARCH:amd64
 		{name: "RoundScaledResidueFloat64x8", argLength: 1, aux: "UInt8"},               // ARCH:amd64
 		{name: "SHA1FourRoundsUint32x4", argLength: 2, aux: "UInt8"},                    // ARCH:amd64
-		{name: "Select128FromPairFloat32x8", argLength: 2, aux: "UInt8"},                // ARCH:amd64
-		{name: "Select128FromPairFloat64x4", argLength: 2, aux: "UInt8"},                // ARCH:amd64
-		{name: "Select128FromPairInt8x32", argLength: 2, aux: "UInt8"},                  // ARCH:amd64
-		{name: "Select128FromPairInt16x16", argLength: 2, aux: "UInt8"},                 // ARCH:amd64
-		{name: "Select128FromPairInt32x8", argLength: 2, aux: "UInt8"},                  // ARCH:amd64
-		{name: "Select128FromPairInt64x4", argLength: 2, aux: "UInt8"},                  // ARCH:amd64
-		{name: "Select128FromPairUint8x32", argLength: 2, aux: "UInt8"},                 // ARCH:amd64
-		{name: "Select128FromPairUint16x16", argLength: 2, aux: "UInt8"},                // ARCH:amd64
-		{name: "Select128FromPairUint32x8", argLength: 2, aux: "UInt8"},                 // ARCH:amd64
-		{name: "Select128FromPairUint64x4", argLength: 2, aux: "UInt8"},                 // ARCH:amd64
 		{name: "SetElemFloat32x4", argLength: 2, aux: "UInt8"},                          // ARCH:amd64
 		{name: "SetElemFloat64x2", argLength: 2, aux: "UInt8"},                          // ARCH:amd64
 		{name: "SetElemInt8x16", argLength: 2, aux: "UInt8"},                            // ARCH:amd64
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 52a6b39..2d375d4 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -7364,6 +7364,16 @@
 	OpCeilScaledResidueFloat64x2
 	OpCeilScaledResidueFloat64x4
 	OpCeilScaledResidueFloat64x8
+	OpConcatPermute128ScalarsFloat32x8
+	OpConcatPermute128ScalarsFloat64x4
+	OpConcatPermute128ScalarsInt8x32
+	OpConcatPermute128ScalarsInt16x16
+	OpConcatPermute128ScalarsInt32x8
+	OpConcatPermute128ScalarsInt64x4
+	OpConcatPermute128ScalarsUint8x32
+	OpConcatPermute128ScalarsUint16x16
+	OpConcatPermute128ScalarsUint32x8
+	OpConcatPermute128ScalarsUint64x4
 	OpConcatShiftBytesRightGroupedUint8x32
 	OpConcatShiftBytesRightGroupedUint8x64
 	OpConcatShiftBytesRightUint8x16
@@ -7432,16 +7442,6 @@
 	OpRoundScaledResidueFloat64x4
 	OpRoundScaledResidueFloat64x8
 	OpSHA1FourRoundsUint32x4
-	OpSelect128FromPairFloat32x8
-	OpSelect128FromPairFloat64x4
-	OpSelect128FromPairInt8x32
-	OpSelect128FromPairInt16x16
-	OpSelect128FromPairInt32x8
-	OpSelect128FromPairInt64x4
-	OpSelect128FromPairUint8x32
-	OpSelect128FromPairUint16x16
-	OpSelect128FromPairUint32x8
-	OpSelect128FromPairUint64x4
 	OpSetElemFloat32x4
 	OpSetElemFloat64x2
 	OpSetElemInt8x16
@@ -95431,6 +95431,66 @@
 		generic: true,
 	},
 	{
+		name:    "ConcatPermute128ScalarsFloat32x8",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "ConcatPermute128ScalarsFloat64x4",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "ConcatPermute128ScalarsInt8x32",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "ConcatPermute128ScalarsInt16x16",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "ConcatPermute128ScalarsInt32x8",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "ConcatPermute128ScalarsInt64x4",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "ConcatPermute128ScalarsUint8x32",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "ConcatPermute128ScalarsUint16x16",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "ConcatPermute128ScalarsUint32x8",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "ConcatPermute128ScalarsUint64x4",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
 		name:    "ConcatShiftBytesRightGroupedUint8x32",
 		auxType: auxUInt8,
 		argLen:  2,
@@ -95839,66 +95899,6 @@
 		generic: true,
 	},
 	{
-		name:    "Select128FromPairFloat32x8",
-		auxType: auxUInt8,
-		argLen:  2,
-		generic: true,
-	},
-	{
-		name:    "Select128FromPairFloat64x4",
-		auxType: auxUInt8,
-		argLen:  2,
-		generic: true,
-	},
-	{
-		name:    "Select128FromPairInt8x32",
-		auxType: auxUInt8,
-		argLen:  2,
-		generic: true,
-	},
-	{
-		name:    "Select128FromPairInt16x16",
-		auxType: auxUInt8,
-		argLen:  2,
-		generic: true,
-	},
-	{
-		name:    "Select128FromPairInt32x8",
-		auxType: auxUInt8,
-		argLen:  2,
-		generic: true,
-	},
-	{
-		name:    "Select128FromPairInt64x4",
-		auxType: auxUInt8,
-		argLen:  2,
-		generic: true,
-	},
-	{
-		name:    "Select128FromPairUint8x32",
-		auxType: auxUInt8,
-		argLen:  2,
-		generic: true,
-	},
-	{
-		name:    "Select128FromPairUint16x16",
-		auxType: auxUInt8,
-		argLen:  2,
-		generic: true,
-	},
-	{
-		name:    "Select128FromPairUint32x8",
-		auxType: auxUInt8,
-		argLen:  2,
-		generic: true,
-	},
-	{
-		name:    "Select128FromPairUint64x4",
-		auxType: auxUInt8,
-		argLen:  2,
-		generic: true,
-	},
-	{
 		name:    "SetElemFloat32x4",
 		auxType: auxUInt8,
 		argLen:  2,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 179e492..73533cb 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2674,6 +2674,36 @@
 	case OpConcatAddPairsUint32x4:
 		v.Op = OpAMD64VPHADDD128
 		return true
+	case OpConcatPermute128ScalarsFloat32x8:
+		v.Op = OpAMD64VPERM2F128256
+		return true
+	case OpConcatPermute128ScalarsFloat64x4:
+		v.Op = OpAMD64VPERM2F128256
+		return true
+	case OpConcatPermute128ScalarsInt16x16:
+		v.Op = OpAMD64VPERM2I128256
+		return true
+	case OpConcatPermute128ScalarsInt32x8:
+		v.Op = OpAMD64VPERM2I128256
+		return true
+	case OpConcatPermute128ScalarsInt64x4:
+		v.Op = OpAMD64VPERM2I128256
+		return true
+	case OpConcatPermute128ScalarsInt8x32:
+		v.Op = OpAMD64VPERM2I128256
+		return true
+	case OpConcatPermute128ScalarsUint16x16:
+		v.Op = OpAMD64VPERM2I128256
+		return true
+	case OpConcatPermute128ScalarsUint32x8:
+		v.Op = OpAMD64VPERM2I128256
+		return true
+	case OpConcatPermute128ScalarsUint64x4:
+		v.Op = OpAMD64VPERM2I128256
+		return true
+	case OpConcatPermute128ScalarsUint8x32:
+		v.Op = OpAMD64VPERM2I128256
+		return true
 	case OpConcatPermuteFloat32x16:
 		v.Op = OpAMD64VPERMI2PS512
 		return true
@@ -5190,36 +5220,6 @@
 		return rewriteValueAMD64_OpSelect0(v)
 	case OpSelect1:
 		return rewriteValueAMD64_OpSelect1(v)
-	case OpSelect128FromPairFloat32x8:
-		v.Op = OpAMD64VPERM2F128256
-		return true
-	case OpSelect128FromPairFloat64x4:
-		v.Op = OpAMD64VPERM2F128256
-		return true
-	case OpSelect128FromPairInt16x16:
-		v.Op = OpAMD64VPERM2I128256
-		return true
-	case OpSelect128FromPairInt32x8:
-		v.Op = OpAMD64VPERM2I128256
-		return true
-	case OpSelect128FromPairInt64x4:
-		v.Op = OpAMD64VPERM2I128256
-		return true
-	case OpSelect128FromPairInt8x32:
-		v.Op = OpAMD64VPERM2I128256
-		return true
-	case OpSelect128FromPairUint16x16:
-		v.Op = OpAMD64VPERM2I128256
-		return true
-	case OpSelect128FromPairUint32x8:
-		v.Op = OpAMD64VPERM2I128256
-		return true
-	case OpSelect128FromPairUint64x4:
-		v.Op = OpAMD64VPERM2I128256
-		return true
-	case OpSelect128FromPairUint8x32:
-		v.Op = OpAMD64VPERM2I128256
-		return true
 	case OpSelectN:
 		return rewriteValueAMD64_OpSelectN(v)
 	case OpSetElemFloat32x4:
diff --git a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
index 5c94e17..7917b19 100644
--- a/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
@@ -226,6 +226,16 @@
 	addF(simdPackage, "Float64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x8.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsFloat32x8, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Float64x4.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsFloat64x4, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Int8x32.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsInt8x32, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Int16x16.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsInt16x16, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Int32x8.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsInt32x8, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Int64x4.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsInt64x4, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Uint8x32.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsUint8x32, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Uint16x16.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsUint16x16, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Uint32x8.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsUint32x8, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Uint64x4.ConcatPermute128Scalars", opLen2Imm8_II(ssa.OpConcatPermute128ScalarsUint64x4, types.TypeVec256, 0), sys.AMD64)
 	addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
 	addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
 	addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8_2I(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
@@ -928,16 +938,6 @@
 	addF(simdPackage, "Float64x2.Scale", opLen2(ssa.OpScaleFloat64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float64x4.Scale", opLen2(ssa.OpScaleFloat64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Float32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat32x8, types.TypeVec256, 0), sys.AMD64)
-	addF(simdPackage, "Float64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat64x4, types.TypeVec256, 0), sys.AMD64)
-	addF(simdPackage, "Int8x32.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt8x32, types.TypeVec256, 0), sys.AMD64)
-	addF(simdPackage, "Int16x16.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt16x16, types.TypeVec256, 0), sys.AMD64)
-	addF(simdPackage, "Int32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt32x8, types.TypeVec256, 0), sys.AMD64)
-	addF(simdPackage, "Int64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt64x4, types.TypeVec256, 0), sys.AMD64)
-	addF(simdPackage, "Uint8x32.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint8x32, types.TypeVec256, 0), sys.AMD64)
-	addF(simdPackage, "Uint16x16.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint16x16, types.TypeVec256, 0), sys.AMD64)
-	addF(simdPackage, "Uint32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint32x8, types.TypeVec256, 0), sys.AMD64)
-	addF(simdPackage, "Uint64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint64x4, types.TypeVec256, 0), sys.AMD64)
 	addF(simdPackage, "Float32x4.SetElem", opLen2Imm8(ssa.OpSetElemFloat32x4, types.TypeVec128, 0), sys.AMD64)
 	addF(simdPackage, "Float64x2.SetElem", opLen2Imm8(ssa.OpSetElemFloat64x2, types.TypeVec128, 0), sys.AMD64)
 	addF(simdPackage, "Int8x16.SetElem", opLen2Imm8(ssa.OpSetElemInt8x16, types.TypeVec128, 0), sys.AMD64)
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
index 70c8178..43a0743 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
@@ -255,7 +255,7 @@
     // This differs from the same method applied to a 32x8 or 32x16 vector, where
     // the 8-bit constant performs the same selection on all the subvectors.
 
-- go: Select128FromPair
+- go: ConcatPermute128Scalars
   commutative: false
   documentation: !string |-
     // NAME treats the 256-bit vectors x and y as a single vector of four
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
index 927f88c..ae64e4e 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
@@ -880,7 +880,7 @@
   out:
   - *v
 
-- go: Select128FromPair
+- go: ConcatPermute128Scalars
   asm: VPERM2F128
   operandOrder: II
   addDoc: !string |-
@@ -903,7 +903,7 @@
   out:
   - *v
 
-- go: Select128FromPair
+- go: ConcatPermute128Scalars
   asm: VPERM2F128
   operandOrder: II
   addDoc: !string |-
@@ -927,7 +927,7 @@
   out:
   - *v
 
-- go: Select128FromPair
+- go: ConcatPermute128Scalars
   asm: VPERM2I128
   operandOrder: II
   addDoc: !string |-
@@ -951,7 +951,7 @@
   out:
   - *v
 
-- go: Select128FromPair
+- go: ConcatPermute128Scalars
   asm: VPERM2I128
   operandOrder: II
   addDoc: !string |-
@@ -975,7 +975,7 @@
   out:
   - *v
 
-- go: Select128FromPair
+- go: ConcatPermute128Scalars
   asm: VPERM2I128
   operandOrder: II
   addDoc: !string |-
@@ -1000,7 +1000,7 @@
   out:
   - *v
 
-- go: Select128FromPair
+- go: ConcatPermute128Scalars
   asm: VPERM2I128
   operandOrder: II
   addDoc: !string |-
diff --git a/src/simd/archsimd/internal/simd_test/simd_test.go b/src/simd/archsimd/internal/simd_test/simd_test.go
index 224d0bf..d3148dc 100644
--- a/src/simd/archsimd/internal/simd_test/simd_test.go
+++ b/src/simd/archsimd/internal/simd_test/simd_test.go
@@ -666,25 +666,25 @@
 	x := archsimd.LoadInt32x4([]int32{0, 1, 2, 3})
 	y := archsimd.LoadInt32x4([]int32{4, 5, 6, 7})
 
-	llll := x.SelectFromPair(0, 1, 2, 3, y)
-	hhhh := x.SelectFromPair(4, 5, 6, 7, y)
-	llhh := x.SelectFromPair(0, 1, 6, 7, y)
-	hhll := x.SelectFromPair(6, 7, 0, 1, y)
+	llll := x.ConcatPermuteScalars(0, 1, 2, 3, y)
+	hhhh := x.ConcatPermuteScalars(4, 5, 6, 7, y)
+	llhh := x.ConcatPermuteScalars(0, 1, 6, 7, y)
+	hhll := x.ConcatPermuteScalars(6, 7, 0, 1, y)
 
-	lllh := x.SelectFromPair(0, 1, 2, 7, y)
-	llhl := x.SelectFromPair(0, 1, 7, 2, y)
-	lhll := x.SelectFromPair(0, 7, 1, 2, y)
-	hlll := x.SelectFromPair(7, 0, 1, 2, y)
+	lllh := x.ConcatPermuteScalars(0, 1, 2, 7, y)
+	llhl := x.ConcatPermuteScalars(0, 1, 7, 2, y)
+	lhll := x.ConcatPermuteScalars(0, 7, 1, 2, y)
+	hlll := x.ConcatPermuteScalars(7, 0, 1, 2, y)
 
-	hhhl := x.SelectFromPair(4, 5, 6, 0, y)
-	hhlh := x.SelectFromPair(4, 5, 0, 6, y)
-	hlhh := x.SelectFromPair(4, 0, 5, 6, y)
-	lhhh := x.SelectFromPair(0, 4, 5, 6, y)
+	hhhl := x.ConcatPermuteScalars(4, 5, 6, 0, y)
+	hhlh := x.ConcatPermuteScalars(4, 5, 0, 6, y)
+	hlhh := x.ConcatPermuteScalars(4, 0, 5, 6, y)
+	lhhh := x.ConcatPermuteScalars(0, 4, 5, 6, y)
 
-	lhlh := x.SelectFromPair(0, 4, 1, 5, y)
-	hlhl := x.SelectFromPair(4, 0, 5, 1, y)
-	lhhl := x.SelectFromPair(0, 4, 5, 1, y)
-	hllh := x.SelectFromPair(4, 0, 1, 5, y)
+	lhlh := x.ConcatPermuteScalars(0, 4, 1, 5, y)
+	hlhl := x.ConcatPermuteScalars(4, 0, 5, 1, y)
+	lhhl := x.ConcatPermuteScalars(0, 4, 5, 1, y)
+	hllh := x.ConcatPermuteScalars(4, 0, 1, 5, y)
 
 	r := make([]int32, 4, 4)
 
@@ -716,7 +716,7 @@
 
 //go:noinline
 func selectFromPairInt32x4(x archsimd.Int32x4, a, b, c, d uint8, y archsimd.Int32x4) archsimd.Int32x4 {
-	return x.SelectFromPair(a, b, c, d, y)
+	return x.ConcatPermuteScalars(a, b, c, d, y)
 }
 
 func TestSelect4FromPairVar(t *testing.T) {
@@ -775,25 +775,25 @@
 	x := archsimd.LoadFloat32x8([]float32{0, 1, 2, 3, 10, 11, 12, 13})
 	y := archsimd.LoadFloat32x8([]float32{4, 5, 6, 7, 14, 15, 16, 17})
 
-	llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
-	hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
-	llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
-	hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
+	llll := x.ConcatPermuteScalarsGrouped(0, 1, 2, 3, y)
+	hhhh := x.ConcatPermuteScalarsGrouped(4, 5, 6, 7, y)
+	llhh := x.ConcatPermuteScalarsGrouped(0, 1, 6, 7, y)
+	hhll := x.ConcatPermuteScalarsGrouped(6, 7, 0, 1, y)
 
-	lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
-	llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
-	lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
-	hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
+	lllh := x.ConcatPermuteScalarsGrouped(0, 1, 2, 7, y)
+	llhl := x.ConcatPermuteScalarsGrouped(0, 1, 7, 2, y)
+	lhll := x.ConcatPermuteScalarsGrouped(0, 7, 1, 2, y)
+	hlll := x.ConcatPermuteScalarsGrouped(7, 0, 1, 2, y)
 
-	hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
-	hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
-	hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
-	lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
+	hhhl := x.ConcatPermuteScalarsGrouped(4, 5, 6, 0, y)
+	hhlh := x.ConcatPermuteScalarsGrouped(4, 5, 0, 6, y)
+	hlhh := x.ConcatPermuteScalarsGrouped(4, 0, 5, 6, y)
+	lhhh := x.ConcatPermuteScalarsGrouped(0, 4, 5, 6, y)
 
-	lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
-	hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
-	lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
-	hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
+	lhlh := x.ConcatPermuteScalarsGrouped(0, 4, 1, 5, y)
+	hlhl := x.ConcatPermuteScalarsGrouped(4, 0, 5, 1, y)
+	lhhl := x.ConcatPermuteScalarsGrouped(0, 4, 5, 1, y)
+	hllh := x.ConcatPermuteScalarsGrouped(4, 0, 1, 5, y)
 
 	r := make([]float32, 8, 8)
 
@@ -823,7 +823,7 @@
 	foo(hllh, 4, 0, 1, 5)
 }
 
-func TestSelectFromPairConstGroupedUint32x16(t *testing.T) {
+func TestConcatPermuteScalarsConstGroupedUint32x16(t *testing.T) {
 	if !archsimd.X86.AVX512() {
 		t.Skip("Test requires X86.AVX512, not available on this hardware")
 		return
@@ -831,25 +831,25 @@
 	x := archsimd.LoadUint32x16([]uint32{0, 1, 2, 3, 10, 11, 12, 13, 20, 21, 22, 23, 30, 31, 32, 33})
 	y := archsimd.LoadUint32x16([]uint32{4, 5, 6, 7, 14, 15, 16, 17, 24, 25, 26, 27, 34, 35, 36, 37})
 
-	llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
-	hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
-	llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
-	hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
+	llll := x.ConcatPermuteScalarsGrouped(0, 1, 2, 3, y)
+	hhhh := x.ConcatPermuteScalarsGrouped(4, 5, 6, 7, y)
+	llhh := x.ConcatPermuteScalarsGrouped(0, 1, 6, 7, y)
+	hhll := x.ConcatPermuteScalarsGrouped(6, 7, 0, 1, y)
 
-	lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
-	llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
-	lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
-	hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
+	lllh := x.ConcatPermuteScalarsGrouped(0, 1, 2, 7, y)
+	llhl := x.ConcatPermuteScalarsGrouped(0, 1, 7, 2, y)
+	lhll := x.ConcatPermuteScalarsGrouped(0, 7, 1, 2, y)
+	hlll := x.ConcatPermuteScalarsGrouped(7, 0, 1, 2, y)
 
-	hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
-	hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
-	hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
-	lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
+	hhhl := x.ConcatPermuteScalarsGrouped(4, 5, 6, 0, y)
+	hhlh := x.ConcatPermuteScalarsGrouped(4, 5, 0, 6, y)
+	hlhh := x.ConcatPermuteScalarsGrouped(4, 0, 5, 6, y)
+	lhhh := x.ConcatPermuteScalarsGrouped(0, 4, 5, 6, y)
 
-	lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
-	hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
-	lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
-	hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
+	lhlh := x.ConcatPermuteScalarsGrouped(0, 4, 1, 5, y)
+	hlhl := x.ConcatPermuteScalarsGrouped(4, 0, 5, 1, y)
+	lhhl := x.ConcatPermuteScalarsGrouped(0, 4, 5, 1, y)
+	hllh := x.ConcatPermuteScalarsGrouped(4, 0, 1, 5, y)
 
 	r := make([]uint32, 16, 16)
 
@@ -883,16 +883,16 @@
 	foo(hllh, 4, 0, 1, 5)
 }
 
-func TestSelect128FromPair(t *testing.T) {
+func TestConcatPermute128Scalars(t *testing.T) {
 	x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
 	y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})
 
-	aa := x.Select128FromPair(0, 0, y)
-	ab := x.Select128FromPair(0, 1, y)
-	bc := x.Select128FromPair(1, 2, y)
-	cd := x.Select128FromPair(2, 3, y)
-	da := x.Select128FromPair(3, 0, y)
-	dc := x.Select128FromPair(3, 2, y)
+	aa := x.ConcatPermute128Scalars(0, 0, y)
+	ab := x.ConcatPermute128Scalars(0, 1, y)
+	bc := x.ConcatPermute128Scalars(1, 2, y)
+	cd := x.ConcatPermute128Scalars(2, 3, y)
+	da := x.ConcatPermute128Scalars(3, 0, y)
+	dc := x.ConcatPermute128Scalars(3, 2, y)
 
 	r := make([]uint64, 4, 4)
 
@@ -910,7 +910,7 @@
 	foo(dc, 3, 2)
 }
 
-func TestSelect128FromPairError(t *testing.T) {
+func TestConcatPermute128ScalarsError(t *testing.T) {
 	x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
 	y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})
 
@@ -919,17 +919,17 @@
 			t.Logf("Saw expected panic %v", r)
 		}
 	}()
-	_ = x.Select128FromPair(0, 4, y)
+	_ = x.ConcatPermute128Scalars(0, 4, y)
 
 	t.Errorf("Should have panicked")
 }
 
 //go:noinline
 func select128FromPair(x archsimd.Uint64x4, lo, hi uint8, y archsimd.Uint64x4) archsimd.Uint64x4 {
-	return x.Select128FromPair(lo, hi, y)
+	return x.ConcatPermute128Scalars(lo, hi, y)
 }
 
-func TestSelect128FromPairVar(t *testing.T) {
+func TestConcatPermute128ScalarsVar(t *testing.T) {
 	x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
 	y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})
 
@@ -960,10 +960,10 @@
 	x := archsimd.LoadUint64x2([]uint64{0, 1})
 	y := archsimd.LoadUint64x2([]uint64{2, 3})
 
-	ll := x.SelectFromPair(0, 1, y)
-	hh := x.SelectFromPair(3, 2, y)
-	lh := x.SelectFromPair(0, 3, y)
-	hl := x.SelectFromPair(2, 1, y)
+	ll := x.ConcatPermuteScalars(0, 1, y)
+	hh := x.ConcatPermuteScalars(3, 2, y)
+	lh := x.ConcatPermuteScalars(0, 3, y)
+	hl := x.ConcatPermuteScalars(2, 1, y)
 
 	r := make([]uint64, 2, 2)
 
@@ -982,10 +982,10 @@
 	x := archsimd.LoadUint64x4([]uint64{0, 1, 10, 11})
 	y := archsimd.LoadUint64x4([]uint64{2, 3, 12, 13})
 
-	ll := x.SelectFromPairGrouped(0, 1, y)
-	hh := x.SelectFromPairGrouped(3, 2, y)
-	lh := x.SelectFromPairGrouped(0, 3, y)
-	hl := x.SelectFromPairGrouped(2, 1, y)
+	ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
+	hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
+	lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
+	hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
 
 	r := make([]uint64, 4, 4)
 
@@ -1004,10 +1004,10 @@
 	x := archsimd.LoadFloat64x4([]float64{0, 1, 10, 11})
 	y := archsimd.LoadFloat64x4([]float64{2, 3, 12, 13})
 
-	ll := x.SelectFromPairGrouped(0, 1, y)
-	hh := x.SelectFromPairGrouped(3, 2, y)
-	lh := x.SelectFromPairGrouped(0, 3, y)
-	hl := x.SelectFromPairGrouped(2, 1, y)
+	ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
+	hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
+	lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
+	hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
 
 	r := make([]float64, 4, 4)
 
@@ -1026,10 +1026,10 @@
 	x := archsimd.LoadInt64x4([]int64{0, 1, 10, 11})
 	y := archsimd.LoadInt64x4([]int64{2, 3, 12, 13})
 
-	ll := x.SelectFromPairGrouped(0, 1, y)
-	hh := x.SelectFromPairGrouped(3, 2, y)
-	lh := x.SelectFromPairGrouped(0, 3, y)
-	hl := x.SelectFromPairGrouped(2, 1, y)
+	ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
+	hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
+	lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
+	hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
 
 	r := make([]int64, 4, 4)
 
@@ -1053,10 +1053,10 @@
 	x := archsimd.LoadInt64x8([]int64{0, 1, 10, 11, 20, 21, 30, 31})
 	y := archsimd.LoadInt64x8([]int64{2, 3, 12, 13, 22, 23, 32, 33})
 
-	ll := x.SelectFromPairGrouped(0, 1, y)
-	hh := x.SelectFromPairGrouped(3, 2, y)
-	lh := x.SelectFromPairGrouped(0, 3, y)
-	hl := x.SelectFromPairGrouped(2, 1, y)
+	ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
+	hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
+	lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
+	hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
 
 	r := make([]int64, 8, 8)
 
diff --git a/src/simd/archsimd/internal/simd_test/transpose_test.go b/src/simd/archsimd/internal/simd_test/transpose_test.go
index abd0706..faa87a7 100644
--- a/src/simd/archsimd/internal/simd_test/transpose_test.go
+++ b/src/simd/archsimd/internal/simd_test/transpose_test.go
@@ -27,10 +27,10 @@
 	// C3G7
 	// D4H8
 
-	b0 = t0.SelectFromPair(0, 1, 4, 5, t2) // lower elements from each
-	b1 = t0.SelectFromPair(2, 3, 6, 7, t2) // upper elements from each
-	b2 = t1.SelectFromPair(0, 1, 4, 5, t3) // lowers
-	b3 = t1.SelectFromPair(2, 3, 6, 7, t3) // uppers
+	b0 = t0.ConcatPermuteScalars(0, 1, 4, 5, t2) // lower elements from each
+	b1 = t0.ConcatPermuteScalars(2, 3, 6, 7, t2) // upper elements from each
+	b2 = t1.ConcatPermuteScalars(0, 1, 4, 5, t3) // lowers
+	b3 = t1.ConcatPermuteScalars(2, 3, 6, 7, t3) // uppers
 	return
 }
 
@@ -51,29 +51,29 @@
 	// C3G7
 	// D4H8
 
-	a0 = t0.SelectFromPairGrouped(0, 1, 4, 5, t2) // lower elements from each
-	a1 = t0.SelectFromPairGrouped(2, 3, 6, 7, t2) // upper elements from each
-	a2 = t1.SelectFromPairGrouped(0, 1, 4, 5, t3) // lowers
-	a3 = t1.SelectFromPairGrouped(2, 3, 6, 7, t3) // uppers
+	a0 = t0.ConcatPermuteScalarsGrouped(0, 1, 4, 5, t2) // lower elements from each
+	a1 = t0.ConcatPermuteScalarsGrouped(2, 3, 6, 7, t2) // upper elements from each
+	a2 = t1.ConcatPermuteScalarsGrouped(0, 1, 4, 5, t3) // lowers
+	a3 = t1.ConcatPermuteScalarsGrouped(2, 3, 6, 7, t3) // uppers
 
-	a4 = t4.SelectFromPairGrouped(0, 1, 4, 5, t6) // lower elements from each
-	a5 = t4.SelectFromPairGrouped(2, 3, 6, 7, t6) // upper elements from each
-	a6 = t5.SelectFromPairGrouped(0, 1, 4, 5, t7) // lowers
-	a7 = t5.SelectFromPairGrouped(2, 3, 6, 7, t7) // uppers
+	a4 = t4.ConcatPermuteScalarsGrouped(0, 1, 4, 5, t6) // lower elements from each
+	a5 = t4.ConcatPermuteScalarsGrouped(2, 3, 6, 7, t6) // upper elements from each
+	a6 = t5.ConcatPermuteScalarsGrouped(0, 1, 4, 5, t7) // lowers
+	a7 = t5.ConcatPermuteScalarsGrouped(2, 3, 6, 7, t7) // uppers
 
 	// next need to swap the upper 128 bits of a0-a3 with the lower 128 bits of a4-a7
 
-	b0 = a0.Select128FromPair(0, 2, a4)
-	b4 = a0.Select128FromPair(1, 3, a4)
+	b0 = a0.ConcatPermute128Scalars(0, 2, a4)
+	b4 = a0.ConcatPermute128Scalars(1, 3, a4)
 
-	b1 = a1.Select128FromPair(0, 2, a5)
-	b5 = a1.Select128FromPair(1, 3, a5)
+	b1 = a1.ConcatPermute128Scalars(0, 2, a5)
+	b5 = a1.ConcatPermute128Scalars(1, 3, a5)
 
-	b2 = a2.Select128FromPair(0, 2, a6)
-	b6 = a2.Select128FromPair(1, 3, a6)
+	b2 = a2.ConcatPermute128Scalars(0, 2, a6)
+	b6 = a2.ConcatPermute128Scalars(1, 3, a6)
 
-	b3 = a3.Select128FromPair(0, 2, a7)
-	b7 = a3.Select128FromPair(1, 3, a7)
+	b3 = a3.ConcatPermute128Scalars(0, 2, a7)
+	b7 = a3.ConcatPermute128Scalars(1, 3, a7)
 
 	return
 }
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index 5cfe5fb..8a720e0 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -1399,6 +1399,172 @@
 // Asm: VPERMI2Q, CPU Feature: AVX512
 func (x Uint64x8) ConcatPermute(y Uint64x8, indices Uint64x8) Uint64x8
 
+/* ConcatPermute128Scalars */
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//	{40, 41, 42, 43, 50, 51, 52, 53}.ConcatPermute128Scalars(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2F128, CPU Feature: AVX
+func (x Float32x8) ConcatPermute128Scalars(lo, hi uint8, y Float32x8) Float32x8
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//	{40, 41, 50, 51}.ConcatPermute128Scalars(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2F128, CPU Feature: AVX
+func (x Float64x4) ConcatPermute128Scalars(lo, hi uint8, y Float64x4) Float64x4
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//	{0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.ConcatPermute128Scalars(3, 0,
+//	     {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
+//
+// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int8x32) ConcatPermute128Scalars(lo, hi uint8, y Int8x32) Int8x32
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//	{40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.ConcatPermute128Scalars(3, 0,
+//	 {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
+//
+// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int16x16) ConcatPermute128Scalars(lo, hi uint8, y Int16x16) Int16x16
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//	{40, 41, 42, 43, 50, 51, 52, 53}.ConcatPermute128Scalars(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int32x8) ConcatPermute128Scalars(lo, hi uint8, y Int32x8) Int32x8
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//	{40, 41, 50, 51}.ConcatPermute128Scalars(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int64x4) ConcatPermute128Scalars(lo, hi uint8, y Int64x4) Int64x4
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//	{0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.ConcatPermute128Scalars(3, 0,
+//	     {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
+//
+// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint8x32) ConcatPermute128Scalars(lo, hi uint8, y Uint8x32) Uint8x32
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//	{40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.ConcatPermute128Scalars(3, 0,
+//	 {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
+//
+// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint16x16) ConcatPermute128Scalars(lo, hi uint8, y Uint16x16) Uint16x16
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//	{40, 41, 42, 43, 50, 51, 52, 53}.ConcatPermute128Scalars(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint32x8) ConcatPermute128Scalars(lo, hi uint8, y Uint32x8) Uint32x8
+
+// ConcatPermute128Scalars treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//	{40, 41, 50, 51}.ConcatPermute128Scalars(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
+//
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// A non-constant value of lo, hi may result in significantly worse performance for this operation.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint64x4) ConcatPermute128Scalars(lo, hi uint8, y Uint64x4) Uint64x4
+
 /* ConcatShiftBytesRight */
 
 // ConcatShiftBytesRight concatenates x and y and shifts it right by shift bytes.
@@ -5531,172 +5697,6 @@
 // Asm: VSCALEFPD, CPU Feature: AVX512
 func (x Float64x8) Scale(y Float64x8) Float64x8
 
-/* Select128FromPair */
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-//	{40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
-//
-// returns {70, 71, 72, 73, 40, 41, 42, 43}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2F128, CPU Feature: AVX
-func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-//	{40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
-//
-// returns {70, 71, 40, 41}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2F128, CPU Feature: AVX
-func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-//	{0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0,
-//	     {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
-//
-// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int8x32) Select128FromPair(lo, hi uint8, y Int8x32) Int8x32
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-//	{40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0,
-//	 {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
-//
-// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int16x16) Select128FromPair(lo, hi uint8, y Int16x16) Int16x16
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-//	{40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
-//
-// returns {70, 71, 72, 73, 40, 41, 42, 43}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-//	{40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
-//
-// returns {70, 71, 40, 41}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-//	{0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0,
-//	     {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
-//
-// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint8x32) Select128FromPair(lo, hi uint8, y Uint8x32) Uint8x32
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-//	{40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0,
-//	 {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
-//
-// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint16x16) Select128FromPair(lo, hi uint8, y Uint16x16) Uint16x16
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-//	{40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
-//
-// returns {70, 71, 72, 73, 40, 41, 42, 43}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-//	{40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
-//
-// returns {70, 71, 40, 41}.
-//
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// A non-constant value of lo, hi may result in significantly worse performance for this operation.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint64x4) Select128FromPair(lo, hi uint8, y Uint64x4) Uint64x4
-
 /* SetElem */
 
 // SetElem returns x with the index'th element set to y.
diff --git a/src/simd/archsimd/shuffles_amd64.go b/src/simd/archsimd/shuffles_amd64.go
index 1ca6a15..d4e8d09 100644
--- a/src/simd/archsimd/shuffles_amd64.go
+++ b/src/simd/archsimd/shuffles_amd64.go
@@ -7,7 +7,7 @@
 package archsimd
 
 // These constants represent the source pattern for the four parameters
-// (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped.
+// (a, b, c, d) passed to ConcatPermuteScalars and ConcatPermuteScalarsGrouped.
 // L means the element comes from the 'x' vector (Low), and
 // H means it comes from the 'y' vector (High).
 // The order of the letters corresponds to elements a, b, c, d.
@@ -37,7 +37,7 @@
 )
 
 // These constants represent the source pattern for the four parameters
-// (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped for
+// (a, b, c, d) passed to ConcatPermuteScalars and ConcatPermuteScalarsGrouped for
 // two-element vectors.
 const (
 	_LL = iota
@@ -46,7 +46,7 @@
 	_HH
 )
 
-// SelectFromPair returns the selection of four elements from the two
+// ConcatPermuteScalars returns the selection of four elements from the two
 // vectors x and y, where selector values in the range 0-3 specify
 // elements from x and values in the range 4-7 specify the 0-3 elements
 // of y.  When the selectors are constants and the selection can be
@@ -55,7 +55,7 @@
 // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
 // elements in the output.  For example,
 //
-//	{1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81})
+//	{1,2,4,8}.ConcatPermuteScalars(2,3,5,7,{9,25,49,81})
 //
 // returns {4,8,25,81}.
 //
@@ -63,7 +63,7 @@
 // call.
 //
 // Asm: VSHUFPS, CPU Feature: AVX
-func (x Int32x4) SelectFromPair(a, b, c, d uint8, y Int32x4) Int32x4 {
+func (x Int32x4) ConcatPermuteScalars(a, b, c, d uint8, y Int32x4) Int32x4 {
 	// pattern gets the concatenation of "x or y?" bits
 	// (0 == x, 1 == y)
 	// This will determine operand choice/order and whether a second
@@ -128,7 +128,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPair returns the selection of four elements from the two
+// ConcatPermuteScalars returns the selection of four elements from the two
 // vectors x and y, where selector values in the range 0-3 specify
 // elements from x and values in the range 4-7 specify the 0-3 elements
 // of y.  When the selectors are constants and can be the selection
@@ -137,7 +137,7 @@
 // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
 // elements in the output.  For example,
 //
-//	{1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81})
+//	{1,2,4,8}.ConcatPermuteScalars(2,3,5,7,{9,25,49,81})
 //
 // returns {4,8,25,81}.
 //
@@ -145,7 +145,7 @@
 // call.
 //
 // Asm: VSHUFPS, CPU Feature: AVX
-func (x Uint32x4) SelectFromPair(a, b, c, d uint8, y Uint32x4) Uint32x4 {
+func (x Uint32x4) ConcatPermuteScalars(a, b, c, d uint8, y Uint32x4) Uint32x4 {
 	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
 
 	a, b, c, d = a&3, b&3, c&3, d&3
@@ -203,7 +203,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPair returns the selection of four elements from the two
+// ConcatPermuteScalars returns the selection of four elements from the two
 // vectors x and y, where selector values in the range 0-3 specify
 // elements from x and values in the range 4-7 specify the 0-3 elements
 // of y.  When the selectors are constants and can be the selection
@@ -212,7 +212,7 @@
 // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
 // elements in the output.  For example,
 //
-//	{1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81})
+//	{1,2,4,8}.ConcatPermuteScalars(2,3,5,7,{9,25,49,81})
 //
 // returns {4,8,25,81}.
 //
@@ -220,7 +220,7 @@
 // call.
 //
 // Asm: VSHUFPS, CPU Feature: AVX
-func (x Float32x4) SelectFromPair(a, b, c, d uint8, y Float32x4) Float32x4 {
+func (x Float32x4) ConcatPermuteScalars(a, b, c, d uint8, y Float32x4) Float32x4 {
 	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
 
 	a, b, c, d = a&3, b&3, c&3, d&3
@@ -278,7 +278,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
 // the vectors x and y, the selection of four elements from  x and y,
 // where selector values in the range 0-3 specify elements from x and
 // values in the range 4-7 specify the 0-3 elements of y.
@@ -288,7 +288,7 @@
 // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
 // elements in the output.  For example,
 //
-//	{1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
+//	{1,2,4,8,16,32,64,128}.ConcatPermuteScalarsGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
 //
 // returns {4,8,25,81,64,128,169,289}.
 //
@@ -296,7 +296,7 @@
 // call.
 //
 // Asm: VSHUFPS, CPU Feature: AVX
-func (x Int32x8) SelectFromPairGrouped(a, b, c, d uint8, y Int32x8) Int32x8 {
+func (x Int32x8) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Int32x8) Int32x8 {
 	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
 
 	a, b, c, d = a&3, b&3, c&3, d&3
@@ -354,7 +354,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
 // the vectors x and y, the selection of four elements from  x and y,
 // where selector values in the range 0-3 specify elements from x and
 // values in the range 4-7 specify the 0-3 elements of y.
@@ -364,7 +364,7 @@
 // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
 // elements in the output.  For example,
 //
-//	{1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
+//	{1,2,4,8,16,32,64,128}.ConcatPermuteScalarsGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
 //
 // returns {4,8,25,81,64,128,169,289}.
 //
@@ -372,7 +372,7 @@
 // call.
 //
 // Asm: VSHUFPS, CPU Feature: AVX
-func (x Uint32x8) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 {
+func (x Uint32x8) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 {
 	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
 
 	a, b, c, d = a&3, b&3, c&3, d&3
@@ -430,7 +430,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
 // the vectors x and y, the selection of four elements from  x and y,
 // where selector values in the range 0-3 specify elements from x and
 // values in the range 4-7 specify the 0-3 elements of y.
@@ -440,7 +440,7 @@
 // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
 // elements in the output.  For example,
 //
-//	{1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
+//	{1,2,4,8,16,32,64,128}.ConcatPermuteScalarsGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
 //
 // returns {4,8,25,81,64,128,169,289}.
 //
@@ -448,7 +448,7 @@
 // call.
 //
 // Asm: VSHUFPS, CPU Feature: AVX
-func (x Float32x8) SelectFromPairGrouped(a, b, c, d uint8, y Float32x8) Float32x8 {
+func (x Float32x8) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Float32x8) Float32x8 {
 	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
 
 	a, b, c, d = a&3, b&3, c&3, d&3
@@ -506,7 +506,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
 // of the vectors x and y, the selection of four elements from  x and y,
 // where selector values in the range 0-3 specify elements from x and
 // values in the range 4-7 specify the 0-3 elements of y.
@@ -518,7 +518,7 @@
 // call.
 //
 // Asm: VSHUFPS, CPU Feature: AVX512
-func (x Int32x16) SelectFromPairGrouped(a, b, c, d uint8, y Int32x16) Int32x16 {
+func (x Int32x16) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Int32x16) Int32x16 {
 	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
 
 	a, b, c, d = a&3, b&3, c&3, d&3
@@ -576,7 +576,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
 // of the vectors x and y, the selection of four elements from  x and y,
 // where selector values in the range 0-3 specify elements from x and
 // values in the range 4-7 specify the 0-3 elements of y.
@@ -588,7 +588,7 @@
 // call.
 //
 // Asm: VSHUFPS, CPU Feature: AVX512
-func (x Uint32x16) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x16) Uint32x16 {
+func (x Uint32x16) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Uint32x16) Uint32x16 {
 	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
 
 	a, b, c, d = a&3, b&3, c&3, d&3
@@ -646,7 +646,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
 // of the vectors x and y, the selection of four elements from  x and y,
 // where selector values in the range 0-3 specify elements from x and
 // values in the range 4-7 specify the 0-3 elements of y.
@@ -658,7 +658,7 @@
 // call.
 //
 // Asm: VSHUFPS, CPU Feature: AVX512
-func (x Float32x16) SelectFromPairGrouped(a, b, c, d uint8, y Float32x16) Float32x16 {
+func (x Float32x16) ConcatPermuteScalarsGrouped(a, b, c, d uint8, y Float32x16) Float32x16 {
 	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
 
 	a, b, c, d = a&3, b&3, c&3, d&3
@@ -744,7 +744,7 @@
 	return g + g<<4
 }
 
-// SelectFromPair returns the selection of two elements from the two
+// ConcatPermuteScalars returns the selection of two elements from the two
 // vectors x and y, where selector values in the range 0-1 specify
 // elements from x and values in the range 2-3 specify the 0-1 elements
 // of y.  When the selectors are constants the selection can be
@@ -754,7 +754,7 @@
 // call.
 //
 // Asm: VSHUFPD, CPU Feature: AVX
-func (x Uint64x2) SelectFromPair(a, b uint8, y Uint64x2) Uint64x2 {
+func (x Uint64x2) ConcatPermuteScalars(a, b uint8, y Uint64x2) Uint64x2 {
 	pattern := (a&2)>>1 + (b & 2)
 
 	a, b = a&1, b&1
@@ -772,7 +772,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
 // the vectors x and y, the selection of two elements from the two
 // vectors x and y, where selector values in the range 0-1 specify
 // elements from x and values in the range 2-3 specify the 0-1 elements
@@ -783,7 +783,7 @@
 // call.
 //
 // Asm: VSHUFPD, CPU Feature: AVX
-func (x Uint64x4) SelectFromPairGrouped(a, b uint8, y Uint64x4) Uint64x4 {
+func (x Uint64x4) ConcatPermuteScalarsGrouped(a, b uint8, y Uint64x4) Uint64x4 {
 	pattern := (a&2)>>1 + (b & 2)
 
 	a, b = a&1, b&1
@@ -801,7 +801,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
 // of the vectors x and y, the selection of two elements from the two
 // vectors x and y, where selector values in the range 0-1 specify
 // elements from x and values in the range 2-3 specify the 0-1 elements
@@ -812,7 +812,7 @@
 // call.
 //
 // Asm: VSHUFPD, CPU Feature: AVX512
-func (x Uint64x8) SelectFromPairGrouped(a, b uint8, y Uint64x8) Uint64x8 {
+func (x Uint64x8) ConcatPermuteScalarsGrouped(a, b uint8, y Uint64x8) Uint64x8 {
 	pattern := (a&2)>>1 + (b & 2)
 
 	a, b = a&1, b&1
@@ -830,7 +830,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPair returns the selection of two elements from the two
+// ConcatPermuteScalars returns the selection of two elements from the two
 // vectors x and y, where selector values in the range 0-1 specify
 // elements from x and values in the range 2-3 specify the 0-1 elements
 // of y.  When the selectors are constants the selection can be
@@ -840,7 +840,7 @@
 // call.
 //
 // Asm: VSHUFPD, CPU Feature: AVX
-func (x Float64x2) SelectFromPair(a, b uint8, y Float64x2) Float64x2 {
+func (x Float64x2) ConcatPermuteScalars(a, b uint8, y Float64x2) Float64x2 {
 	pattern := (a&2)>>1 + (b & 2)
 
 	a, b = a&1, b&1
@@ -858,7 +858,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
 // the vectors x and y, the selection of two elements from the two
 // vectors x and y, where selector values in the range 0-1 specify
 // elements from x and values in the range 2-3 specify the 0-1 elements
@@ -869,7 +869,7 @@
 // call.
 //
 // Asm: VSHUFPD, CPU Feature: AVX
-func (x Float64x4) SelectFromPairGrouped(a, b uint8, y Float64x4) Float64x4 {
+func (x Float64x4) ConcatPermuteScalarsGrouped(a, b uint8, y Float64x4) Float64x4 {
 	pattern := (a&2)>>1 + (b & 2)
 
 	a, b = a&1, b&1
@@ -887,7 +887,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
 // of the vectors x and y, the selection of two elements from the two
 // vectors x and y, where selector values in the range 0-1 specify
 // elements from x and values in the range 2-3 specify the 0-1 elements
@@ -898,7 +898,7 @@
 // call.
 //
 // Asm: VSHUFPD, CPU Feature: AVX512
-func (x Float64x8) SelectFromPairGrouped(a, b uint8, y Float64x8) Float64x8 {
+func (x Float64x8) ConcatPermuteScalarsGrouped(a, b uint8, y Float64x8) Float64x8 {
 	pattern := (a&2)>>1 + (b & 2)
 
 	a, b = a&1, b&1
@@ -916,7 +916,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPair returns the selection of two elements from the two
+// ConcatPermuteScalars returns the selection of two elements from the two
 // vectors x and y, where selector values in the range 0-1 specify
 // elements from x and values in the range 2-3 specify the 0-1 elements
 // of y.  When the selectors are constants the selection can be
@@ -926,7 +926,7 @@
 // call.
 //
 // Asm: VSHUFPD, CPU Feature: AVX
-func (x Int64x2) SelectFromPair(a, b uint8, y Int64x2) Int64x2 {
+func (x Int64x2) ConcatPermuteScalars(a, b uint8, y Int64x2) Int64x2 {
 	pattern := (a&2)>>1 + (b & 2)
 
 	a, b = a&1, b&1
@@ -944,7 +944,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// ConcatPermuteScalarsGrouped returns, for each of the two 128-bit halves of
 // the vectors x and y, the selection of two elements from the two
 // vectors x and y, where selector values in the range 0-1 specify
 // elements from x and values in the range 2-3 specify the 0-1 elements
@@ -955,7 +955,7 @@
 // call.
 //
 // Asm: VSHUFPD, CPU Feature: AVX
-func (x Int64x4) SelectFromPairGrouped(a, b uint8, y Int64x4) Int64x4 {
+func (x Int64x4) ConcatPermuteScalarsGrouped(a, b uint8, y Int64x4) Int64x4 {
 	pattern := (a&2)>>1 + (b & 2)
 
 	a, b = a&1, b&1
@@ -973,7 +973,7 @@
 	panic("missing case, switch should be exhaustive")
 }
 
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// ConcatPermuteScalarsGrouped returns, for each of the four 128-bit subvectors
 // of the vectors x and y, the selection of two elements from the two
 // vectors x and y, where selector values in the range 0-1 specify
 // elements from x and values in the range 2-3 specify the 0-1 elements
@@ -984,7 +984,7 @@
 // call.
 //
 // Asm: VSHUFPD, CPU Feature: AVX512
-func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 {
+func (x Int64x8) ConcatPermuteScalarsGrouped(a, b uint8, y Int64x8) Int64x8 {
 	pattern := (a&2)>>1 + (b & 2)
 
 	a, b = a&1, b&1

Change information

Files:

M src/cmd/compile/internal/amd64/simdssa.go
M src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
M src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
M src/cmd/compile/internal/ssa/opGen.go
M src/cmd/compile/internal/ssa/rewriteAMD64.go
M src/cmd/compile/internal/ssagen/simdAMD64intrinsics.go
M src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
M src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
M src/simd/archsimd/internal/simd_test/simd_test.go
M src/simd/archsimd/internal/simd_test/transpose_test.go
M src/simd/archsimd/ops_amd64.go
M src/simd/archsimd/shuffles_amd64.go

Change size: L

Delta: 12 files changed, 451 insertions(+), 451 deletions(-)

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Junyang Shao (Gerrit)

unread,

5:11 PM (6 hours ago) 5:11 PM

to goph...@pubsubhelper.golang.org, David Chase, golang-co...@googlegroups.com

Attention needed from David Chase

Junyang Shao voted Commit-Queue+1

Commit-Queue

Open in Gerrit

Related details

Attention is currently required from:

David Chase

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Reply all

Reply to author

Forward

0 new messages