[dev.simd] cmd/internal/obj/arm64: add scalar pairwise and reduction operand patterns
Add missing operand patterns for two categories of ASIMD instructions:
Across-vector reductions (VFMAXV, VFMAXNMV, VFMINV, VFMINNMV):
The existing optab entry for AVADDV uses C_VREG as the destination,
but F-register syntax (e.g. "VFMAXNMV V14.S4, F15") parses as C_FREG.
Add a duplicate optab entry with C_FREG destination. The encoding in
case 85 is identical since F and V registers share the same hardware
register file.
Scalar pairwise (VFADDP, VFMAXP, VFMINP, VFMAXNMP, VFMINNMP):
These are 2-operand AdvSIMD scalar pairwise instructions: Vn.<T> -> Fd.
Add a new optab entry with pattern {C_ARNG, C_NONE, C_NONE, C_FREG}
and encoding case 110 implementing:
0x7e300800 | sz<<22 | opcode<<12 | Rn<<5 | Rd
Uncomment the 9 corresponding encoding tests in arm64enc.s. The same diff
also uncomments three vector compare tests (VFCMEQ, VFCMGE, VFCMGT).
diff --git a/src/cmd/asm/internal/asm/testdata/arm64enc.s b/src/cmd/asm/internal/asm/testdata/arm64enc.s
index 5258d29..0876080 100644
--- a/src/cmd/asm/internal/asm/testdata/arm64enc.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64enc.s
@@ -436,22 +436,22 @@
VFADD V21.D2, V10.D2, V21.D2 // 55d5754e
FADDS F12, F2, F10 // 4a282c1e
FADDD F24, F14, F12 // cc29781e
- //TODO VFADDP V4.D2, F13 // 8dd8707e
+ VFADDP V4.D2, F13 // 8dd8707e
VFADDP V30.S4, V3.S4, V11.S4 // 6bd43e6e
FCCMPS LE, F17, F12, $14 // 8ed5311e
FCCMPD HI, F11, F15, $15 // ef856b1e
FCCMPES HS, F28, F13, $13 // bd253c1e
FCCMPED LT, F20, F4, $9 // 99b4741e
//TODO FCMEQ F7, F11, F26 // 7ae5675e
- //TODO VFCMEQ V29.S4, V26.S4, V30.S4 // 5ee73d4e
+ VFCMEQ V29.S4, V26.S4, V30.S4 // 5ee73d4e
//TODO FCMEQ $0, F17, F22 // 36daa05e
VFCMEQ $0, V17.D2, V22.D2 // 36dae04e
//TODO FCMGE F29, F31, F13 // ede77d7e
- //TODO VFCMGE V8.S2, V31.S2, V2.S2 // e2e7282e
+ VFCMGE V8.S2, V31.S2, V2.S2 // e2e7282e
//TODO FCMGE $0, F18, F27 // e2e7282e
VFCMGE $0, V14.S2, V8.S2 // c8c9a02e
//TODO FCMGT F20, F2, F8 // 48e4b47e
- //TODO VFCMGT V26.D2, V15.D2, V23.D2 // f7e5fa6e
+ VFCMGT V26.D2, V15.D2, V23.D2 // f7e5fa6e
//TODO FCMGT $0, F14, F3 // c3c9e05e
VFCMGT $0, V6.S2, V28.S2 // dcc8a00e
//TODO FCMLE $0, F26, F25 // 59dba07e
@@ -558,24 +558,24 @@
VFMAXNM V3.D2, V12.D2, V27.D2 // 9bc5634e
FMAXNMS F11, F24, F12 // 0c6b2b1e
FMAXNMD F20, F6, F16 // d068741e
- //TODO VFMAXNMP V3.S2, F2 // 62c8307e
+ VFMAXNMP V3.S2, F2 // 62c8307e
VFMAXNMP V25.S2, V4.S2, V2.S2 // 82c4392e
- //TODO VFMAXNMV V14.S4, F15 // cfc9306e
- //TODO VFMAXP V3.S2, F27 // 7bf8307e
+ VFMAXNMV V14.S4, F15 // cfc9306e
+ VFMAXP V3.S2, F27 // 7bf8307e
VFMAXP V29.S2, V30.S2, V9.S2 // c9f73d2e
- //TODO VFMAXV V13.S4, F14 // aef9306e
+ VFMAXV V13.S4, F14 // aef9306e
VFMIN V19.D2, V30.D2, V7.D2 // c7f7f34e
FMINS F26, F18, F30 // 5e5a3a1e
FMIND F29, F4, F21 // 95587d1e
VFMINNM V21.S4, V5.S4, V1.S4 // a1c4b54e
FMINNMS F23, F20, F1 // 817a371e
FMINNMD F8, F3, F24 // 7878681e
- //TODO VFMINNMP V16.D2, F12 // 0ccaf07e
+ VFMINNMP V16.D2, F12 // 0ccaf07e
VFMINNMP V10.S4, V25.S4, V27.S4 // 3bc7aa6e
- //TODO VFMINNMV V8.S4, F3 // 03c9b06e
- //TODO VFMINP V10.S2, F20 // 54f9b07e
+ VFMINNMV V8.S4, F3 // 03c9b06e
+ VFMINP V10.S2, F20 // 54f9b07e
VFMINP V1.D2, V10.D2, V3.D2 // 43f5e16e
- //TODO VFMINV V11.S4, F9 // 69f9b06e
+ VFMINV V11.S4, F9 // 69f9b06e
//TODO VFMLA V6.S[0], F2, F14 // 4e10865f
//TODO VFMLA V28.S[2], V2.S2, V30.S2 // 5e189c0f
VFMLA V29.S2, V20.S2, V14.S2 // 8ece3d0e
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index 2a398ff..4dc93bd 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -388,6 +388,12 @@
{AVADD, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 89, 4, 0, 0, 0},
{AVADD, C_VREG, C_NONE, C_NONE, C_VREG, C_NONE, 89, 4, 0, 0, 0},
{AVADDV, C_ARNG, C_NONE, C_NONE, C_VREG, C_NONE, 85, 4, 0, 0, 0},
+ {AVADDV, C_ARNG, C_NONE, C_NONE, C_FREG, C_NONE, 85, 4, 0, 0, 0},
+
+ /* scalar pairwise reductions: vfaddp/vfmaxp/vfminp/vfmaxnmp/vfminnmp Vn.<T>, Fd
+ These use AVFMLA as the root opcode so they are included in the AVFMLA oprange,
+ which covers AVFADDP, AVFMAXP, AVFMINP, AVFMAXNMP, AVFMINNMP via oprangeset. */
+ {AVFMLA, C_ARNG, C_NONE, C_NONE, C_FREG, C_NONE, 110, 4, 0, 0, 0},
/* logical operations */
{AAND, C_ZREG, C_ZREG, C_NONE, C_ZREG, C_NONE, 1, 4, 0, 0, 0},
@@ -3300,6 +3306,9 @@
oprangeset(AVFMAXNMP, t)
oprangeset(AVFMINNMP, t)
+ case AVFADDP, AVFMAXP, AVFMINP, AVFMAXNMP, AVFMINNMP:
+ // scalar pairwise forms handled by case 110; no additional oprangesets needed
+
case AVPMULL:
oprangeset(AVPMULL2, t)
@@ -6032,6 +6041,42 @@
rd := uint32(p.To.Reg & 31)
rn := uint32(p.Reg & 31)
o1 |= Q<<30 | size<<22 | (rn << 5) | (rd)
+
+ case 110: /* scalar pairwise reduction: vfaddp/vfmaxp/vfminp/vfmaxnmp/vfminnmp Vn.<T>, Fd */
+ // AdvSIMD scalar pairwise: 0 1 U 1 1110 sz 1 1000 opcode 10 Rn Rd
+ // Base encoding: 0x7e300800
+ // sz[0] = 1 for D2 arrangement, 0 for S2
+ // sz[1] = 1 for min variants (VFMINNMP, VFMINP), 0 for max/add variants
+ af := int((p.From.Reg >> 5) & 15)
+ var sz, opcode uint32
+ switch af {
+ case ARNG_2S:
+ sz = 0
+ case ARNG_2D:
+ sz = 1
+ default:
+ c.ctxt.Diag("invalid arrangement: %v\n", p)
+ }
+ switch p.As {
+ case AVFADDP:
+ opcode = 0x0d
+ case AVFMAXP, AVFMAXNMP:
+ opcode = 0x0f
+ if p.As == AVFMAXNMP {
+ opcode = 0x0c
+ }
+ case AVFMINP, AVFMINNMP:
+ sz |= 2 // set sz[1] for min variants
+ opcode = 0x0f
+ if p.As == AVFMINNMP {
+ opcode = 0x0c
+ }
+ default:
+ c.ctxt.Diag("unsupported op %v\n", p.As)
+ }
+ rn := uint32(p.From.Reg & 31)
+ rd := uint32(p.To.Reg & 31)
+ o1 = 0x7e300800 | sz<<22 | opcode<<12 | rn<<5 | rd
}
out[0] = o1
out[1] = o2
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
These use AVFMLA as the root opcode so they are included in the AVFMLA oprange,
Would it be better to make `AVFADDP` (or any other of them) a new root, with `vfmaxp`/`vfminp`/`vfmaxnmp`/`vfminnmp` as its successors? I could do it in my change (I've just split a large change from CL 747961 into a more reviewable chain in CL 762200; the chain now contains all the changes from there, so this should be easy to rebase onto it too), but this CL may be a better place, because it shows where adding a separate root for scalar pairwise (fp) reductions helps.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
These use AVFMLA as the root opcode so they are included in the AVFMLA oprange,
Would it be better to make `AVFADDP` (or any other of them) a new root, with `vfmaxp`/`vfminp`/`vfmaxnmp`/`vfminnmp` as its successors? I could do it in my change (I've just split a large change from CL 747961 into a more reviewable chain in CL 762200; the chain now contains all the changes from there, so this should be easy to rebase onto it too), but this CL may be a better place, because it shows where adding a separate root for scalar pairwise (fp) reductions helps.
Agreed — AVFADDP as the root for scalar pairwise reductions is cleaner. I'll update this CL. Thanks for the pointer to CL 762200. Are you planning to submit work to the master branch going forward?
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
These use AVFMLA as the root opcode so they are included in the AVFMLA oprange,
Jonathan Swinney
Would it be better to make `AVFADDP` (or any other of them) a new root, with `vfmaxp`/`vfminp`/`vfmaxnmp`/`vfminnmp` as its successors? I could do it in my change (I've just split a large change from CL 747961 into a more reviewable chain in CL 762200; the chain now contains all the changes from there, so this should be easy to rebase onto it too), but this CL may be a better place, because it shows where adding a separate root for scalar pairwise (fp) reductions helps.
Agreed — AVFADDP as the root for scalar pairwise reductions is cleaner. I'll update this CL. Thanks for the pointer to CL 762200. Are you planning to submit work to the master branch going forward?
Thanks! The chain in CL 762200 includes only assembler changes, and I'm hoping to eventually replace CL 747961 in this chain by rebasing onto the reviewed / submitted changes after they get from master to dev.simd. For everything else, I plan to continue working on dev.simd on this chain.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
[dev.simd] cmd/internal/obj/arm64: add scalar pairwise and reduction operand patterns
Add missing operand patterns for two categories of ASIMD instructions:
Across-vector reductions (VFMAXV, VFMAXNMV, VFMINV, VFMINNMV):
The existing optab entry for AVADDV uses C_VREG as the destination,
but F-register syntax (e.g. "VFMAXNMV V14.S4, F15") parses as C_FREG.
Add a duplicate optab entry with C_FREG destination. The encoding in
case 85 is identical since F and V registers share the same hardware
register file.
Scalar pairwise (VFADDP, VFMAXP, VFMINP, VFMAXNMP, VFMINNMP):
These are 2-operand AdvSIMD scalar pairwise instructions: Vn.<T> -> Fd.
Add a new optab entry with pattern {C_ARNG, C_NONE, C_NONE, C_FREG}
and encoding case 110 implementing:
0x7e300800 | sz<<22 | opcode<<12 | Rn<<5 | Rd
Make AVFADDP its own oprangeset root for pairwise instructions instead
of sharing AVFMLA's oprangeset. This prevents pairwise instructions
(AVFMAXP, AVFMINP, AVFMAXNMP, AVFMINNMP) from incorrectly accepting
indexed element forms (C_ELEM, case 111), and prevents AVFMLA from
incorrectly accepting the scalar pairwise form (C_ARNG -> C_FREG,
case 110).
AVFADDP now has two optab entries:
- case 72: vector pairwise (C_ARNG, C_ARNG -> C_ARNG)
- case 110: scalar pairwise (C_ARNG -> C_FREG)
AVFMLA retains:
- case 72: 3-register vector (C_ARNG, C_ARNG -> C_ARNG)
- case 111: indexed element (C_ELEM patterns)
Uncomment the 9 corresponding encoding tests in arm64enc.s.
diff --git a/src/cmd/asm/internal/asm/testdata/arm64enc.s b/src/cmd/asm/internal/asm/testdata/arm64enc.s
index c6ce1dd..b74ddfe 100644
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index ed5f680..e1bd8e1 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -388,6 +388,13 @@
{AVADD, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 89, 4, 0, 0, 0},
{AVADD, C_VREG, C_NONE, C_NONE, C_VREG, C_NONE, 89, 4, 0, 0, 0},
{AVADDV, C_ARNG, C_NONE, C_NONE, C_VREG, C_NONE, 85, 4, 0, 0, 0},
+ {AVADDV, C_ARNG, C_NONE, C_NONE, C_FREG, C_NONE, 85, 4, 0, 0, 0},
+
+ /* pairwise reductions rooted at AVFADDP, which is shared with AVFMAXP,
+ AVFMINP, AVFMAXNMP, AVFMINNMP via oprangeset: the vector form
+ Vm.<T>, Vn.<T>, Vd.<T> (case 72) and the scalar form Vn.<T>, Fd (case 110). */
+ {AVFADDP, C_ARNG, C_ARNG, C_NONE, C_ARNG, C_NONE, 72, 4, 0, 0, 0},
+ {AVFADDP, C_ARNG, C_NONE, C_NONE, C_FREG, C_NONE, 110, 4, 0, 0, 0},
/* logical operations */
{AAND, C_ZREG, C_ZREG, C_NONE, C_ZREG, C_NONE, 1, 4, 0, 0, 0},
@@ -3307,10 +3314,11 @@
oprangeset(AVFDIV, t)
oprangeset(AVFMAX, t)
oprangeset(AVFMAXNM, t)
- oprangeset(AVFMAXP, t)
- oprangeset(AVFADDP, t)
oprangeset(AVFMIN, t)
oprangeset(AVFMINNM, t)
+
+ case AVFADDP:
+ oprangeset(AVFMAXP, t)
oprangeset(AVFMINP, t)
oprangeset(AVFMAXNMP, t)
oprangeset(AVFMINNMP, t)
@@ -6051,6 +6059,42 @@
+
case 127:
// Generic SVE instruction encoding
matched := false
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Jonathan Swinney abandoned this change.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |