cmd/compile: lower float min/max branch idiom to MINSD/MAXSD/FCSEL
The compiler emits a compare-and-branch for the float min/max idiom
(r := b; if a < b { r = a }) instead of the single hardware instruction
the CPU provides. The idiom "a < b ? a : b" has the same NaN and
signed-zero semantics as MINSD/MINSS (unordered or equal yields the
second operand), so unlike the min builtin it needs no fixup.
branchelim now forms a float CondSelect for this exact shape (strict <,
the compared operands as the select arms, both min and max orderings;
a > b canonicalizes to b < a). amd64 lowers to MINSD/MINSS and new
MAXSD/MAXSS ops; arm64 lowers to FCSELD/FCSELS (FMIND/FMINS can't be
used: they propagate NaN per IEEE, disagreeing with the branch).
riscv64 is left branching as it has no FP conditional-select.
Verified for all NaN, signed-zero and infinity combinations on amd64
and arm64; go build std, cmd/compile/internal/ssa tests and
test/codegen all pass.
Fixes #72831
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 6fa78f5..867d111 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -277,6 +277,7 @@
ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
ssa.OpAMD64MINSS, ssa.OpAMD64MINSD,
+ ssa.OpAMD64MAXSS, ssa.OpAMD64MAXSD,
ssa.OpAMD64POR, ssa.OpAMD64PXOR,
ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index 44bf01d..e4fdfb5 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -1442,7 +1442,7 @@
simdV11(s, v, arm64.ARNG_8B)
case ssa.OpARM64VUADDLV:
simdV11Scalar(s, v, arm64.ARNG_8B)
- case ssa.OpARM64CSEL, ssa.OpARM64CSEL0:
+ case ssa.OpARM64CSEL, ssa.OpARM64CSEL0, ssa.OpARM64FCSELD, ssa.OpARM64FCSELS:
r1 := int16(arm64.REGZERO)
if v.Op != ssa.OpARM64CSEL0 {
r1 = v.Args[1].Reg()
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index dff5c93..719a30d 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -395,6 +395,15 @@
(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && is16BitInt(t)
=> (CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond)
+// A "x < y ? x : y" select matches MINSx exactly (NaN and signed zero
+// included), so emit a single instruction. Less(32|64)F x y lowers to
+// SETGF (UCOMISx y x).
+(CondSelect <t> x y (SETGF (UCOMISD y x))) && is64BitFloat(t) => (MINSD x y)
+(CondSelect <t> x y (SETGF (UCOMISS y x))) && is32BitFloat(t) => (MINSS x y)
+// "x < y ? y : x" is max; the compare operands appear in the other order.
+(CondSelect <t> x y (SETGF (UCOMISD x y))) && is64BitFloat(t) => (MAXSD x y)
+(CondSelect <t> x y (SETGF (UCOMISS x y))) && is32BitFloat(t) => (MAXSS x y)
+
(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && (is64BitInt(t) || isPtr(t))
=> (CMOVQNE y x (CMPQconst [0] check))
(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && is32BitInt(t)
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
index a2bbd9d..1f96cfd 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
@@ -820,6 +820,8 @@
// special cases the 2nd argument is preferred.
{name: "MINSD", argLength: 2, reg: fp21, resultInArg0: true, asm: "MINSD", earlyOk: true}, // min(arg0,arg1)
{name: "MINSS", argLength: 2, reg: fp21, resultInArg0: true, asm: "MINSS", earlyOk: true}, // min(arg0,arg1)
+ {name: "MAXSD", argLength: 2, reg: fp21, resultInArg0: true, asm: "MAXSD", earlyOk: true}, // max(arg0,arg1)
+ {name: "MAXSS", argLength: 2, reg: fp21, resultInArg0: true, asm: "MAXSS", earlyOk: true}, // max(arg0,arg1)
{name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ", earlyOk: true}, // (int64)(-1) if carry is set, 0 if carry is clear.
{name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL", earlyOk: true}, // (int32)(-1) if carry is set, 0 if carry is clear.
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64.rules b/src/cmd/compile/internal/ssa/_gen/ARM64.rules
index 3c773e1..f06c13b 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64.rules
@@ -298,9 +298,15 @@
(FCMPD x (FMOVDconst [0])) => (FCMPD0 x)
(FCMPD (FMOVDconst [0]) x) => (InvertFlags (FCMPD0 x))
+// Float selects use FCSEL on the FP registers. branchelim only forms these
+// for the "a < b ? a : b" / "a < b ? b : a" min/max idiom, whose
+// flag-generating compare is always present, so no TSTW fixup is needed.
+(CondSelect x y boolval) && is64BitFloat(v.Type) && flagArg(boolval) != nil => (FCSELD [boolval.Op] x y flagArg(boolval))
+(CondSelect x y boolval) && is32BitFloat(v.Type) && flagArg(boolval) != nil => (FCSELS [boolval.Op] x y flagArg(boolval))
+
// CSEL needs a flag-generating argument. Synthesize a TSTW if necessary.
-(CondSelect x y boolval) && flagArg(boolval) != nil => (CSEL [boolval.Op] x y flagArg(boolval))
-(CondSelect x y boolval) && flagArg(boolval) == nil => (CSEL [OpARM64NotEqual] x y (TSTWconst [1] boolval))
+(CondSelect x y boolval) && !v.Type.IsFloat() && flagArg(boolval) != nil => (CSEL [boolval.Op] x y flagArg(boolval))
+(CondSelect x y boolval) && !v.Type.IsFloat() && flagArg(boolval) == nil => (CSEL [OpARM64NotEqual] x y (TSTWconst [1] boolval))
(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVDaddr [int32(off)] ptr)
(OffPtr [off] ptr) => (ADDconst [off] ptr)
@@ -1351,6 +1357,8 @@
// absorb InvertFlags into conditional instructions
(CSEL [cc] x y (InvertFlags cmp)) => (CSEL [arm64Invert(cc)] x y cmp)
(CSEL0 [cc] x (InvertFlags cmp)) => (CSEL0 [arm64Invert(cc)] x cmp)
+(FCSELD [cc] x y (InvertFlags cmp)) => (FCSELD [arm64Invert(cc)] x y cmp)
+(FCSELS [cc] x y (InvertFlags cmp)) => (FCSELS [arm64Invert(cc)] x y cmp)
(CSETM [cc] (InvertFlags cmp)) => (CSETM [arm64Invert(cc)] cmp)
(CSINC [cc] x y (InvertFlags cmp)) => (CSINC [arm64Invert(cc)] x y cmp)
(CSINV [cc] x y (InvertFlags cmp)) => (CSINV [arm64Invert(cc)] x y cmp)
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
index 04d960f..d81aedf 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
@@ -521,6 +521,8 @@
{name: "CSINC", argLength: 3, reg: gp2flags1, asm: "CSINC", aux: "CCop", earlyOk: true}, // auxint(flags) ? arg0 : arg1 + 1
{name: "CSINV", argLength: 3, reg: gp2flags1, asm: "CSINV", aux: "CCop", earlyOk: true}, // auxint(flags) ? arg0 : ^arg1
{name: "CSNEG", argLength: 3, reg: gp2flags1, asm: "CSNEG", aux: "CCop", earlyOk: true}, // auxint(flags) ? arg0 : -arg1
+ {name: "FCSELD", argLength: 3, reg: fp21, asm: "FCSELD", aux: "CCop", earlyOk: true}, // auxint(flags) ? arg0 : arg1, 64-bit float
+ {name: "FCSELS", argLength: 3, reg: fp21, asm: "FCSELS", aux: "CCop", earlyOk: true}, // auxint(flags) ? arg0 : arg1, 32-bit float
{name: "CSETM", argLength: 1, reg: readflags, asm: "CSETM", aux: "CCop", earlyOk: true}, // auxint(flags) ? -1 : 0
// conditional comparison instructions; auxint is
diff --git a/src/cmd/compile/internal/ssa/branchelim.go b/src/cmd/compile/internal/ssa/branchelim.go
index 8c411b5..427aa10 100644
--- a/src/cmd/compile/internal/ssa/branchelim.go
+++ b/src/cmd/compile/internal/ssa/branchelim.go
@@ -4,7 +4,10 @@
package ssa
-import "cmd/internal/src"
+import (
+ "cmd/compile/internal/types"
+ "cmd/internal/src"
+)
// branchelim tries to eliminate branches by
// generating CondSelect instructions.
@@ -107,6 +110,30 @@
}
}
+// canCondSelectMinMaxF reports whether a float-typed phi can be turned into a
+// CondSelect that lowers to a single hardware min/max instruction. trueVal is
+// the phi argument chosen when cond is true, falseVal otherwise. The strict
+// forms "a < b ? a : b" (min) and "a < b ? b : a" (max) match MINSx/MAXSx
+// exactly, including their NaN and signed-zero behavior, so no fixup is needed.
+// Greater comparisons are canonicalized to Less with swapped operands, so only
+// Less needs to be matched here.
+func canCondSelectMinMaxF(arch string, t *types.Type, cond, trueVal, falseVal *Value) bool {
+ switch arch {
+ case "amd64", "arm64":
+ default:
+ return false
+ }
+ if !t.IsFloat() {
+ return false
+ }
+ switch cond.Op {
+ case OpLess32F, OpLess64F:
+ return (trueVal == cond.Args[0] && falseVal == cond.Args[1]) || // min
+ (trueVal == cond.Args[1] && falseVal == cond.Args[0]) // max
+ }
+ return false
+}
+
// elimIf converts the one-way branch starting at dom in f to a conditional move if possible.
// loadAddr is a set of values which are used to compute the address of a load.
// Those values are exempt from CMOV generation.
@@ -134,6 +161,9 @@
// Now decide if fusing 'simple' into dom+post
// looks profitable.
+ // swap reports whether a Phi's first argument is the false-branch value; both the checks below and the Phi rewrite need it.
+ swap := (post.Preds[0].Block() == dom) != (dom.Succs[0].Block() == post)
+
// Check that there are Phis, and that all of them
// can be safely rewritten to CondSelect.
hasphis := false
@@ -141,7 +171,13 @@
if v.Op == OpPhi {
hasphis = true
if !canCondSelect(v, f.Config.arch, loadAddr) {
- return false
+ trueVal, falseVal := v.Args[0], v.Args[1]
+ if swap {
+ trueVal, falseVal = falseVal, trueVal
+ }
+ if !canCondSelectMinMaxF(f.Config.arch, v.Type, dom.Controls[0], trueVal, falseVal) {
+ return false
+ }
}
}
}
@@ -158,9 +194,6 @@
if len(simple.Values) > maxfuseinsts || !canSpeculativelyExecute(simple) {
return false
}
-
- // Replace Phi instructions in b with CondSelect instructions
- swap := (post.Preds[0].Block() == dom) != (dom.Succs[0].Block() == post)
for _, v := range post.Values {
if v.Op != OpPhi {
continue
@@ -326,12 +359,19 @@
if len(post.Preds) != 2 || post == b {
return false
}
+ swap := post.Preds[0].Block() != b.Succs[0].Block()
hasphis := false
for _, v := range post.Values {
if v.Op == OpPhi {
hasphis = true
if !canCondSelect(v, f.Config.arch, loadAddr) {
- return false
+ trueVal, falseVal := v.Args[0], v.Args[1]
+ if swap {
+ trueVal, falseVal = falseVal, trueVal
+ }
+ if !canCondSelectMinMaxF(f.Config.arch, v.Type, b.Controls[0], trueVal, falseVal) {
+ return false
+ }
}
}
}
@@ -345,7 +385,6 @@
}
// now we're committed: rewrite each Phi as a CondSelect
- swap := post.Preds[0].Block() != b.Succs[0].Block()
for _, v := range post.Values {
if v.Op != OpPhi {
continue
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index c113437..9440c12 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -937,6 +937,8 @@
OpAMD64VFMADD231SD
OpAMD64MINSD
OpAMD64MINSS
+ OpAMD64MAXSD
+ OpAMD64MAXSS
OpAMD64SBBQcarrymask
OpAMD64SBBLcarrymask
OpAMD64SETEQ
@@ -4915,6 +4917,8 @@
OpARM64CSINC
OpARM64CSINV
OpARM64CSNEG
+ OpARM64FCSELD
+ OpARM64FCSELS
OpARM64CSETM
OpARM64CCMP
OpARM64CCMN
@@ -17897,6 +17901,38 @@
},
},
{
+ name: "MAXSD",
+ argLen: 2,
+ resultInArg0: true,
+ earlyOk: true,
+ asm: x86.AMAXSD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, regMask{v1: 2147418112, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ {1, regMask{v1: 2147418112, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, regMask{v1: 2147418112, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
+ name: "MAXSS",
+ argLen: 2,
+ resultInArg0: true,
+ earlyOk: true,
+ asm: x86.AMAXSS,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, regMask{v1: 2147418112, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ {1, regMask{v1: 2147418112, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, regMask{v1: 2147418112, v2: 0}}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
name: "SBBQcarrymask",
argLen: 1,
earlyOk: true,
@@ -78741,6 +78777,38 @@
},
},
{
+ name: "FCSELD",
+ auxType: auxCCop,
+ argLen: 3,
+ earlyOk: true,
+ asm: arm64.AFCSELD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ {1, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
+ name: "FCSELS",
+ auxType: auxCCop,
+ argLen: 3,
+ earlyOk: true,
+ asm: arm64.AFCSELS,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ {1, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, regMask{v1: 9223372034707292160, v2: 0}}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
+ {
name: "CSETM",
auxType: auxCCop,
argLen: 1,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 9a45de3..6151a0f 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -95699,6 +95699,94 @@
v.AddArg3(y, x, cond)
return true
}
+ // match: (CondSelect <t> x y (SETGF (UCOMISD y x)))
+ // cond: is64BitFloat(t)
+ // result: (MINSD x y)
+ for {
+ t := v.Type
+ x := v_0
+ y := v_1
+ if v_2.Op != OpAMD64SETGF {
+ break
+ }
+ v_2_0 := v_2.Args[0]
+ if v_2_0.Op != OpAMD64UCOMISD {
+ break
+ }
+ _ = v_2_0.Args[1]
+ if y != v_2_0.Args[0] || x != v_2_0.Args[1] || !(is64BitFloat(t)) {
+ break
+ }
+ v.reset(OpAMD64MINSD)
+ v.AddArg2(x, y)
+ return true
+ }
+ // match: (CondSelect <t> x y (SETGF (UCOMISS y x)))
+ // cond: is32BitFloat(t)
+ // result: (MINSS x y)
+ for {
+ t := v.Type
+ x := v_0
+ y := v_1
+ if v_2.Op != OpAMD64SETGF {
+ break
+ }
+ v_2_0 := v_2.Args[0]
+ if v_2_0.Op != OpAMD64UCOMISS {
+ break
+ }
+ _ = v_2_0.Args[1]
+ if y != v_2_0.Args[0] || x != v_2_0.Args[1] || !(is32BitFloat(t)) {
+ break
+ }
+ v.reset(OpAMD64MINSS)
+ v.AddArg2(x, y)
+ return true
+ }
+ // match: (CondSelect <t> x y (SETGF (UCOMISD x y)))
+ // cond: is64BitFloat(t)
+ // result: (MAXSD x y)
+ for {
+ t := v.Type
+ x := v_0
+ y := v_1
+ if v_2.Op != OpAMD64SETGF {
+ break
+ }
+ v_2_0 := v_2.Args[0]
+ if v_2_0.Op != OpAMD64UCOMISD {
+ break
+ }
+ _ = v_2_0.Args[1]
+ if x != v_2_0.Args[0] || y != v_2_0.Args[1] || !(is64BitFloat(t)) {
+ break
+ }
+ v.reset(OpAMD64MAXSD)
+ v.AddArg2(x, y)
+ return true
+ }
+ // match: (CondSelect <t> x y (SETGF (UCOMISS x y)))
+ // cond: is32BitFloat(t)
+ // result: (MAXSS x y)
+ for {
+ t := v.Type
+ x := v_0
+ y := v_1
+ if v_2.Op != OpAMD64SETGF {
+ break
+ }
+ v_2_0 := v_2.Args[0]
+ if v_2_0.Op != OpAMD64UCOMISS {
+ break
+ }
+ _ = v_2_0.Args[1]
+ if x != v_2_0.Args[0] || y != v_2_0.Args[1] || !(is32BitFloat(t)) {
+ break
+ }
+ v.reset(OpAMD64MAXSS)
+ v.AddArg2(x, y)
+ return true
+ }
// match: (CondSelect <t> x y check)
// cond: !check.Type.IsFlags() && check.Type.Size() == 8 && (is64BitInt(t) || isPtr(t))
// result: (CMOVQNE y x (CMPQconst [0] check))
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index 428d313..ee36dbe 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -106,6 +106,10 @@
return rewriteValueARM64_OpARM64FCMPD(v)
case OpARM64FCMPS:
return rewriteValueARM64_OpARM64FCMPS(v)
+ case OpARM64FCSELD:
+ return rewriteValueARM64_OpARM64FCSELD(v)
+ case OpARM64FCSELS:
+ return rewriteValueARM64_OpARM64FCSELS(v)
case OpARM64FCVTDS:
return rewriteValueARM64_OpARM64FCVTDS(v)
case OpARM64FLDPQ:
@@ -6132,6 +6136,48 @@
}
return false
}
+func rewriteValueARM64_OpARM64FCSELD(v *Value) bool {
+ v_2 := v.Args[2]
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (FCSELD [cc] x y (InvertFlags cmp))
+ // result: (FCSELD [arm64Invert(cc)] x y cmp)
+ for {
+ cc := auxIntToOp(v.AuxInt)
+ x := v_0
+ y := v_1
+ if v_2.Op != OpARM64InvertFlags {
+ break
+ }
+ cmp := v_2.Args[0]
+ v.reset(OpARM64FCSELD)
+ v.AuxInt = opToAuxInt(arm64Invert(cc))
+ v.AddArg3(x, y, cmp)
+ return true
+ }
+ return false
+}
+func rewriteValueARM64_OpARM64FCSELS(v *Value) bool {
+ v_2 := v.Args[2]
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (FCSELS [cc] x y (InvertFlags cmp))
+ // result: (FCSELS [arm64Invert(cc)] x y cmp)
+ for {
+ cc := auxIntToOp(v.AuxInt)
+ x := v_0
+ y := v_1
+ if v_2.Op != OpARM64InvertFlags {
+ break
+ }
+ cmp := v_2.Args[0]
+ v.reset(OpARM64FCSELS)
+ v.AuxInt = opToAuxInt(arm64Invert(cc))
+ v.AddArg3(x, y, cmp)
+ return true
+ }
+ return false
+}
func rewriteValueARM64_OpARM64FCVTDS(v *Value) bool {
v_0 := v.Args[0]
// match: (FCVTDS (FABSD (FCVTSD x)))
@@ -19885,13 +19931,43 @@
v_0 := v.Args[0]
b := v.Block
// match: (CondSelect x y boolval)
- // cond: flagArg(boolval) != nil
+ // cond: is64BitFloat(v.Type) && flagArg(boolval) != nil
+ // result: (FCSELD [boolval.Op] x y flagArg(boolval))
+ for {
+ x := v_0
+ y := v_1
+ boolval := v_2
+ if !(is64BitFloat(v.Type) && flagArg(boolval) != nil) {
+ break
+ }
+ v.reset(OpARM64FCSELD)
+ v.AuxInt = opToAuxInt(boolval.Op)
+ v.AddArg3(x, y, flagArg(boolval))
+ return true
+ }
+ // match: (CondSelect x y boolval)
+ // cond: is32BitFloat(v.Type) && flagArg(boolval) != nil
+ // result: (FCSELS [boolval.Op] x y flagArg(boolval))
+ for {
+ x := v_0
+ y := v_1
+ boolval := v_2
+ if !(is32BitFloat(v.Type) && flagArg(boolval) != nil) {
+ break
+ }
+ v.reset(OpARM64FCSELS)
+ v.AuxInt = opToAuxInt(boolval.Op)
+ v.AddArg3(x, y, flagArg(boolval))
+ return true
+ }
+ // match: (CondSelect x y boolval)
+ // cond: !v.Type.IsFloat() && flagArg(boolval) != nil
// result: (CSEL [boolval.Op] x y flagArg(boolval))
for {
x := v_0
y := v_1
boolval := v_2
- if !(flagArg(boolval) != nil) {
+ if !(!v.Type.IsFloat() && flagArg(boolval) != nil) {
break
}
v.reset(OpARM64CSEL)
@@ -19900,13 +19976,13 @@
return true
}
// match: (CondSelect x y boolval)
- // cond: flagArg(boolval) == nil
+ // cond: !v.Type.IsFloat() && flagArg(boolval) == nil
// result: (CSEL [OpARM64NotEqual] x y (TSTWconst [1] boolval))
for {
x := v_0
y := v_1
boolval := v_2
- if !(flagArg(boolval) == nil) {
+ if !(!v.Type.IsFloat() && flagArg(boolval) == nil) {
break
}
v.reset(OpARM64CSEL)
diff --git a/test/codegen/floats.go b/test/codegen/floats.go
index 09ef427..7bf15f6 100644
--- a/test/codegen/floats.go
+++ b/test/codegen/floats.go
@@ -215,6 +215,49 @@
return max(a, b)
}
+// The "a < b ? a : b" idiom has the same NaN and signed-zero behavior as a
+// single MINSD/MINSS, so it should compile to one instruction rather than a
+// compare and branch.
+func Float64MinBranch(a, b float64) float64 {
+ r := b
+ // amd64:"MINSD"
+ // arm64:"FCSELD"
+ if a < b {
+ r = a
+ }
+ return r
+}
+
+func Float32MinBranch(a, b float32) float32 {
+ r := b
+ // amd64:"MINSS"
+ // arm64:"FCSELS"
+ if a < b {
+ r = a
+ }
+ return r
+}
+
+func Float64MaxBranch(a, b float64) float64 {
+ r := b
+ // amd64:"MAXSD"
+ // arm64:"FCSELD"
+ if a > b {
+ r = a
+ }
+ return r
+}
+
+func Float32MaxBranch(a, b float32) float32 {
+ r := b
+ // amd64:"MAXSS"
+ // arm64:"FCSELS"
+ if a > b {
+ r = a
+ }
+ return r
+}
+
// ------------------------ //
// Constant Optimizations //
// ------------------------ //
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Abhinav Srivastava abandoned this change.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
cmd/compile: lower float min/max branch idiom to MINSD/MAXSD/FCSEL
The compiler emits a compare-and-branch for the float min/max idiom
(r := b; if a < b { r = a }) instead of the single hardware instruction
the CPU provides. The idiom "a < b ? a : b" has the same NaN and
signed-zero semantics as MINSD/MINSS (unordered or equal yields the
second operand), so unlike the min builtin it needs no fixup.
branchelim now forms a float CondSelect for this exact shape (strict <,
the compared operands as the select arms, both min and max orderings;
a > b canonicalizes to b < a). amd64 lowers to MINSD/MINSS and new
MAXSD/MAXSS ops; arm64 lowers to FCSELD/FCSELS (FMIND/FMINS can't be
used: they propagate NaN per IEEE, disagreeing with the branch).
riscv64 is left branching as it has no FP conditional-select.
Adds a runtime test checking the idiom against an unoptimized reference
for all NaN, signed-zero and infinity combinations, and codegen tests
asserting the branch is eliminated (and that <=/>= and riscv64 are not
transformed).
Fixes #72831
index 3c773e1..c10de9f 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64.rules
@@ -298,9 +298,18 @@
(FCMPD x (FMOVDconst [0])) => (FCMPD0 x)
(FCMPD (FMOVDconst [0]) x) => (InvertFlags (FCMPD0 x))
+// Float selects use FCSEL on the FP registers. The only producer of a
+// float-typed CondSelect is branchelim's canCondSelectMinMaxF gate, which
+// admits just the "a < b ? a : b" min/max idiom; its flag-generating compare
+// is always present, so no TSTW fixup is needed. If a float CondSelect were
+// ever produced from a non-compare condition, flagArg would be nil and the
+// value would (correctly) not match these rules.
+(CondSelect x y boolval) && is64BitFloat(v.Type) && flagArg(boolval) != nil => (FCSELD [boolval.Op] x y flagArg(boolval))
+(CondSelect x y boolval) && is32BitFloat(v.Type) && flagArg(boolval) != nil => (FCSELS [boolval.Op] x y flagArg(boolval))
+
// CSEL needs a flag-generating argument. Synthesize a TSTW if necessary.
-(CondSelect x y boolval) && flagArg(boolval) != nil => (CSEL [boolval.Op] x y flagArg(boolval))
-(CondSelect x y boolval) && flagArg(boolval) == nil => (CSEL [OpARM64NotEqual] x y (TSTWconst [1] boolval))
+(CondSelect x y boolval) && !v.Type.IsFloat() && flagArg(boolval) != nil => (CSEL [boolval.Op] x y flagArg(boolval))
+(CondSelect x y boolval) && !v.Type.IsFloat() && flagArg(boolval) == nil => (CSEL [OpARM64NotEqual] x y (TSTWconst [1] boolval))
(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVDaddr [int32(off)] ptr)
(OffPtr [off] ptr) => (ADDconst [off] ptr)
@@ -1351,6 +1360,8 @@
// absorb InvertFlags into conditional instructions
(CSEL [cc] x y (InvertFlags cmp)) => (CSEL [arm64Invert(cc)] x y cmp)
(CSEL0 [cc] x (InvertFlags cmp)) => (CSEL0 [arm64Invert(cc)] x cmp)
+(FCSELD [cc] x y (InvertFlags cmp)) => (FCSELD [arm64Invert(cc)] x y cmp)
+(FCSELS [cc] x y (InvertFlags cmp)) => (FCSELS [arm64Invert(cc)] x y cmp)
(CSETM [cc] (InvertFlags cmp)) => (CSETM [arm64Invert(cc)] cmp)
(CSINC [cc] x y (InvertFlags cmp)) => (CSINC [arm64Invert(cc)] x y cmp)
(CSINV [cc] x y (InvertFlags cmp)) => (CSINV [arm64Invert(cc)] x y cmp)
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
index 04d960f..6edbaa0 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
@@ -521,6 +521,8 @@
{name: "CSINC", argLength: 3, reg: gp2flags1, asm: "CSINC", aux: "CCop", earlyOk: true}, // auxint(flags) ? arg0 : arg1 + 1
{name: "CSINV", argLength: 3, reg: gp2flags1, asm: "CSINV", aux: "CCop", earlyOk: true}, // auxint(flags) ? arg0 : ^arg1
{name: "CSNEG", argLength: 3, reg: gp2flags1, asm: "CSNEG", aux: "CCop", earlyOk: true}, // auxint(flags) ? arg0 : -arg1
+ {name: "FCSELD", argLength: 3, reg: fp21, asm: "FCSELD", aux: "CCop", earlyOk: true}, // auxint(flags) ? arg0 : arg1, 64-bit float
+ {name: "FCSELS", argLength: 3, reg: fp21, asm: "FCSELS", aux: "CCop", earlyOk: true}, // auxint(flags) ? arg0 : arg1, 32-bit float
{name: "CSETM", argLength: 1, reg: readflags, asm: "CSETM", aux: "CCop", earlyOk: true}, // auxint(flags) ? -1 : 0
// conditional comparison instructions; auxint is
diff --git a/src/cmd/compile/internal/ssa/branchelim.go b/src/cmd/compile/internal/ssa/branchelim.go
index 8c411b5..3da0faf 100644
--- a/src/cmd/compile/internal/ssa/branchelim.go
+++ b/src/cmd/compile/internal/ssa/branchelim.go
@@ -4,7 +4,10 @@
package ssa
-import "cmd/internal/src"
+import (
+ "cmd/compile/internal/types"
+ "cmd/internal/src"
+)
// branchelim tries to eliminate branches by
// generating CondSelect instructions.
@@ -107,6 +110,30 @@
}
}
+// canCondSelectMinMaxF reports whether a float-typed phi can be turned into a
+// CondSelect that lowers to a single hardware min/max instruction. trueVal is
+// the phi argument chosen when cond is true, falseVal otherwise. The strict
+// forms "a < b ? a : b" (min) and "a < b ? b : a" (max) match MINSx/MAXSx
+// exactly, including their NaN and signed-zero behavior, so no fixup is needed.
+// Greater comparisons are canonicalized to Less with swapped operands during
+// SSA building, so only Less needs to be matched here.
diff --git a/src/runtime/minmax_test.go b/src/runtime/minmax_test.go
index 1f815a8..681d559 100644
--- a/src/runtime/minmax_test.go
+++ b/src/runtime/minmax_test.go
@@ -128,6 +128,106 @@
test(2, 1, 0)
}
+// The "a < b ? a : b" branch idiom is lowered to a single min/max
+// instruction (MINSD/MAXSD on amd64, FCSEL on arm64). Unlike the min/max
+// builtins it keeps the branch's own NaN and signed-zero behavior, so test
+// it against an unoptimized reference. The assign forms below match the phi
+// shape the optimization rewrites; the reference forms use early returns,
+// which do not form a phi and are left as a compare and branch.
+
+//go:noinline
+func minBranch64(a, b float64) float64 {
+ r := b
+ if a < b {
+ r = a
+ }
+ return r
+}
+
+//go:noinline
+func maxBranch64(a, b float64) float64 {
+ r := b
+ if a > b {
+ r = a
+ }
+ return r
+}
+
+//go:noinline
+func minBranch32(a, b float32) float32 {
+ r := b
+ if a < b {
+ r = a
+ }
+ return r
+}
+
+//go:noinline
+func maxBranch32(a, b float32) float32 {
+ r := b
+ if a > b {
+ r = a
+ }
+ return r
+}
+
+func minRef64(a, b float64) float64 {
+ if a < b {
+ return a
+ }
+ return b
+}
+
+func maxRef64(a, b float64) float64 {
+ if a > b {
+ return a
+ }
+ return b
+}
+
+func sameFloat64(x, y float64) bool {
+ if math.IsNaN(x) && math.IsNaN(y) {
+ return true
+ }
+ return x == y && math.Signbit(x) == math.Signbit(y)
+}
+
+func sameFloat32(x, y float32) bool {
+ xf, yf := float64(x), float64(y)
+ if math.IsNaN(xf) && math.IsNaN(yf) {
+ return true
+ }
+ return x == y && math.Signbit(xf) == math.Signbit(yf)
+}
+
+func TestFloatMinMaxBranchIdiom(t *testing.T) {
+ for _, a := range all {
+ for _, b := range all {
+ if got, want := minBranch64(a, b), minRef64(a, b); !sameFloat64(got, want) {
+ t.Errorf("minBranch64(%v, %v) = %v, want %v", a, b, got, want)
+ }
+ if got, want := maxBranch64(a, b), maxRef64(a, b); !sameFloat64(got, want) {
+ t.Errorf("maxBranch64(%v, %v) = %v, want %v", a, b, got, want)
+ }
+ a32, b32 := float32(a), float32(b)
+ wantMin := b32
+ if a32 < b32 {
+ wantMin = a32
+ }
+ if got := minBranch32(a32, b32); !sameFloat32(got, wantMin) {
+ t.Errorf("minBranch32(%v, %v) = %v, want %v", a32, b32, got, wantMin)
+ }
+ wantMax := b32
+ if a32 > b32 {
+ wantMax = a32
+ }
+ if got := maxBranch32(a32, b32); !sameFloat32(got, wantMax) {
+ t.Errorf("maxBranch32(%v, %v) = %v, want %v", a32, b32, got, wantMax)
+ }
+ }
+ }
+}
+
func BenchmarkMinFloat(b *testing.B) {
var m float64 = 0
for i := 0; i < b.N; i++ {
diff --git a/test/codegen/floats.go b/test/codegen/floats.go
index 09ef427..ddccc4d 100644
--- a/test/codegen/floats.go
+++ b/test/codegen/floats.go
@@ -215,6 +215,85 @@
return max(a, b)
}
+// The "a < b ? a : b" idiom has the same NaN and signed-zero behavior as a
+// single MINSD/MINSS, so it should compile to one instruction rather than a
+// compare and branch.
+func Float64MinBranch(a, b float64) float64 {
+ r := b
+ // amd64:"MINSD" -"UCOMISD" -"JLS" -"JCC"
+ // arm64:"FCSELD" -"BLT" -"BGT"
+ if a < b {
+ r = a
+ }
+ return r
+}
+
+func Float32MinBranch(a, b float32) float32 {
+ r := b
+ // amd64:"MINSS" -"UCOMISS"
+ // arm64:"FCSELS"
+ if a < b {
+ r = a
+ }
+ return r
+}
+
+func Float64MaxBranch(a, b float64) float64 {
+ r := b
+ // amd64:"MAXSD" -"UCOMISD" -"JLS" -"JCC"
+ // arm64:"FCSELD" -"BLT" -"BGT"
+ if a > b {
+ r = a
+ }
+ return r
+}
+
+func Float32MaxBranch(a, b float32) float32 {
+ r := b
+ // amd64:"MAXSS" -"UCOMISS"
+ // arm64:"FCSELS"
+ if a > b {
+ r = a
+ }
+ return r
+}
+
+// The if/else form lowers the same way as the assign form above.
+func Float64MinIfElse(a, b float64) float64 {
+ var r float64
+ // amd64:"MINSD"
+ // arm64:"FCSELD"
+ if a < b {
+ r = a
+ } else {
+ r = b
+ }
+ return r
+}
+
+// Non-strict comparisons (<=, >=) take the branch on equal operands, so they
+// do NOT match MINSD/MAXSD semantics (which prefer the second operand) and
+// must be left as a compare and branch.
+func Float64MinLeqNotFused(a, b float64) float64 {
+ r := b
+ // amd64:-"MINSD"
+ // arm64:-"FCSELD"
+ if a <= b {
+ r = a
+ }
+ return r
+}
+
+// riscv64 has no FP conditional-select, so the idiom stays a branch there.
+func Float64MinBranchRISCV(a, b float64) float64 {
+ r := b
+ // riscv64:-"FMIN"
+ if a < b {
+ r = a
+ }
+ return r
+}
+
// ------------------------ //
// Constant Optimizations //
// ------------------------ //
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
(CondSelect <t> x y (SETGF (UCOMISD y x))) && is64BitFloat(t) => (MINSD x y)
It does worry me that we are only matching very specific conditions here. What happens if that condition isn't matched? We need some way to handle that case.
For instance, what if (UCOMISD y x) or even (SETGF (UCOMISD x y)) gets constant-folded away, or the prove pass decides it knows the answer and rewrites it to a constant?
We generally want lowering rules that are guaranteed to always trigger.
Best is to do:
(GenericFoo x y) => (LOWEREDFOO x y)
(LOWEREDFOO x (SOMEOP y)) => (LOWEREDFOO_PLUSOP x y)
So the generic op lowering is always available, but can be further optimized.
(This may require introducing floating-point conditional move instructions, which exist on amd64 but only for v3?)
Still ok, is to do:
(GenericFoo x y) && condition => (LOWEREDOP1 x y)
(GenericFoo x y) && !condition => (LOWEREDOP2 x y)
So at least it is obvious that lowering will always happen. The conditions on the rules are ideally non-overlapping and cover every possibility.
(For what it is worth, our current CondSelect rules kind of have this problem already. I'm not sure I understand how they always successfully lower.)
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
(CondSelect <t> x y (SETGF (UCOMISD y x))) && is64BitFloat(t) => (MINSD x y)
It does worry me that we are only matching very specific conditions here. What happens if that condition isn't matched? We need some way to handle that case.
For instance, what if (UCOMISD y x) or even (SETGF (UCOMISD x y)) gets constant-folded away, or the prove pass decides it knows the answer and rewrites it to a constant?
We generally want lowering rules that are guaranteed to always trigger.
Best is to do:
(GenericFoo x y) => (LOWEREDFOO x y)
(LOWEREDFOO x (SOMEOP y)) => (LOWEREDFOO_PLUSOP x y)
So the generic op lowering is always available, but can be further optimized.
(This may require introducing floating-point conditional move instructions, which exist on amd64 but only for v3?)
Still OK is to do:
(GenericFoo x y) && condition => (LOWEREDOP1 x y)
(GenericFoo x y) && !condition => (LOWEREDOP2 x y)
So at least it is obvious that lowering will always happen. The conditions on the rules are ideally non-overlapping and cover every possibility.
(For what it is worth, our current CondSelect rules kind of have this problem already. I'm not sure I understand how they always successfully lower.)
Didn't think of that. I can switch to a dedicated min/max op that `branchelim` emits directly, so it always lowers (MINSD/MAXSD on amd64, FCSEL on arm64) with no condition to fold. Also, since min/max have baseline instructions on both arches, I think this avoids needing a general float cmov, but let me know if I'm missing something.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |