cmd/compile: optimize 32-bit unsigned division by pre-shifted magic constant
For unsigned 32-bit division by a constant d on 64-bit targets (amd64,
arm64, riscv64, ...), when the magic multiplier M = 2^32 + m has m odd
(i.e. M is truly 33-bit and cannot be halved to fit in 32 bits), use a
pre-shifted constant C = M << (32 - s) so that a single Hmul64u gives
the quotient directly:
x / d = Hmul64u(ZeroExt32to64(x), C)
This replaces the avg-based sequence for odd divisors (Case 3) and the
SHR+IMULQ+SHR sequence for even divisors whose underlying magic is
33-bit (e.g. d=14=2×7). Divisors where m is even (Case 6) keep their
existing 32-bit constant path to avoid loading a large 64-bit constant
on arm64/riscv64.
Benchmark on Intel Xeon w9-3495X (3 divisions per iteration, go1.26.3):
BenchmarkOdd (d=7,19,107): 3.07 ns/op → 1.92 ns/op (~38% faster)
BenchmarkEven (d=14,38,214): 2.04 ns/op → 1.92 ns/op (~6% faster)
Ref: https://arxiv.org/abs/2604.07902
https://github.com/llvm/llvm-project/pull/181288
Fixes #79780
diff --git a/src/cmd/compile/internal/ssa/_gen/divmod.rules b/src/cmd/compile/internal/ssa/_gen/divmod.rules
index f06a435..976599a 100644
--- a/src/cmd/compile/internal/ssa/_gen/divmod.rules
+++ b/src/cmd/compile/internal/ssa/_gen/divmod.rules
@@ -182,7 +182,20 @@
(Hmul64u <typ.UInt64> x (Const64 <typ.UInt64> [int64(1<<63 + umagic64(c).m/2)]))
(Const64 <typ.UInt64> [umagic64(c).s - 1]))
+// Pre-shifted: on non-wasm 64-bit targets, when m is odd (magic is truly 33-bit and cannot
+// be halved to fit in 32 bits). This covers odd divisors (Case 3) and even divisors whose
+// underlying magic is 33-bit (e.g. d=14=2x7 shares the same m as d=7).
+// When m is even (Case 6), the 32-bit constant path above is preferred to avoid loading
+// a large 64-bit constant on aarch64/riscv64.
+(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name != "wasm" && umagic32(c).m&1 != 0 =>
+ (Trunc64to32 <t>
+ (Hmul64u <typ.UInt64>
+ (ZeroExt32to64 x)
+ (Const64 <typ.UInt64> [int64(umagic32PreShifted(c))])))
+
// Case 7. Unsigned divide where c is even.
+// For non-wasm 64-bit with m odd, the pre-shifted rule above takes priority.
+// This rule handles wasm (m odd) and 32-bit targets.
(Div16u <t> x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && c&1 == 0 =>
(Trunc32to16 <t>
(Rsh32Ux64 <typ.UInt32>
@@ -218,7 +231,7 @@
(Lsh32x64 <typ.UInt32> (ZeroExt16to32 x) (Const64 <typ.UInt64> [16]))
(Mul32 <typ.UInt32> (ZeroExt16to32 x) (Const32 <typ.UInt32> [int32(umagic16(c).m)])))
(Const64 <typ.UInt64> [16 + umagic16(c).s - 1])))
-(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 =>
+(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name == "wasm" =>
(Trunc64to32 <t>
(Rsh64Ux64 <typ.UInt64>
(Avg64u
diff --git a/src/cmd/compile/internal/ssa/magic.go b/src/cmd/compile/internal/ssa/magic.go
index 29a57fb..2d4f572 100644
--- a/src/cmd/compile/internal/ssa/magic.go
+++ b/src/cmd/compile/internal/ssa/magic.go
@@ -136,6 +136,19 @@
func umagic32(c int32) umagicData { return umagic(32, int64(c)) }
func umagic64(c int64) umagicData { return umagic(64, c) }
+// umagic32PreShifted returns the pre-shifted 64-bit magic constant for unsigned 32-bit
+// division by c on 64-bit targets that have a native 64x64->128-bit multiply instruction
+// (amd64 MULQ, arm64 UMULH, riscv64 MULHU, etc.), enabling:
+//
+// x / c = Hmul64u(ZeroExt32to64(x), umagic32PreShifted(c))
+//
+// Given umagic32(c) returning m and s, the constant is (2^32 + m) << (32 - s).
+// Valid when umagicOK32(c) is true. Result always fits in uint64.
+func umagic32PreShifted(c int32) uint64 {
+ magic := umagic32(c)
+ return (uint64(1)<<32 + magic.m) << uint(32-magic.s)
+}
+
// For signed division, we use a similar strategy.
// First, we enforce a positive c.
// x / c = -(x / (-c))
diff --git a/src/cmd/compile/internal/ssa/rewritedivmod.go b/src/cmd/compile/internal/ssa/rewritedivmod.go
index 1ee39b3..6a4e803 100644
--- a/src/cmd/compile/internal/ssa/rewritedivmod.go
+++ b/src/cmd/compile/internal/ssa/rewritedivmod.go
@@ -487,6 +487,30 @@
return true
}
// match: (Div32u <t> x (Const32 [c]))
+ // cond: umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name != "wasm" && umagic32(c).m&1 != 0
+ // result: (Trunc64to32 <t> (Hmul64u <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [int64(umagic32PreShifted(c))])))
+ for {
+ t := v.Type
+ x := v_0
+ if v_1.Op != OpConst32 {
+ break
+ }
+ c := auxIntToInt32(v_1.AuxInt)
+ if !(umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name != "wasm" && umagic32(c).m&1 != 0) {
+ break
+ }
+ v.reset(OpTrunc64to32)
+ v.Type = t
+ v0 := b.NewValue0(v.Pos, OpHmul64u, typ.UInt64)
+ v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
+ v1.AddArg(x)
+ v2 := b.NewValue0(v.Pos, OpConst64, typ.UInt64)
+ v2.AuxInt = int64ToAuxInt(int64(umagic32PreShifted(c)))
+ v0.AddArg2(v1, v2)
+ v.AddArg(v0)
+ return true
+ }
+ // match: (Div32u <t> x (Const32 [c]))
// cond: umagicOK32(c) && config.RegSize == 8 && c&1 == 0
// result: (Trunc64to32 <t> (Rsh64Ux64 <typ.UInt64> (Mul64 <typ.UInt64> (Rsh64Ux64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [1])) (Const64 <typ.UInt64> [int64(1<<31 + (umagic32(c).m+1)/2)])) (Const64 <typ.UInt64> [32 + umagic32(c).s - 2])))
for {
@@ -547,7 +571,7 @@
return true
}
// match: (Div32u <t> x (Const32 [c]))
- // cond: umagicOK32(c) && config.RegSize == 8
+ // cond: umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name == "wasm"
// result: (Trunc64to32 <t> (Rsh64Ux64 <typ.UInt64> (Avg64u (Lsh64x64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [32])) (Mul64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [int64(umagic32(c).m)]))) (Const64 <typ.UInt64> [32 + umagic32(c).s - 1])))
for {
t := v.Type
@@ -556,7 +580,7 @@
break
}
c := auxIntToInt32(v_1.AuxInt)
- if !(umagicOK32(c) && config.RegSize == 8) {
+ if !(umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name == "wasm") {
break
}
v.reset(OpTrunc64to32)
diff --git a/test/codegen/divmod.go b/test/codegen/divmod.go
index adcba5d..05ee930 100644
--- a/test/codegen/divmod.go
+++ b/test/codegen/divmod.go
@@ -325,14 +325,29 @@
// 386: "SHRL [$]1,"
// 386: "MOVL [$]-1840700269,"
// 386: "SHRL [$]2,"
- // arm64: "UBFX [$]1, R[0-9]+, [$]31,"
- // arm64: "MOVD [$]2454267027,"
- // arm64: "MUL"
- // arm64: "LSR [$]34,"
+ // amd64: "MOVQ [$]1317624576808583168,"
+ // amd64: "MULQ"
+ // amd64: -"SHRQ"
+ // arm64: "MOVD [$]1317624576808583168,"
+ // arm64: "UMULH"
+ // arm64: -"UBFX"
+ // arm64: -"LSR"
// wasm: "I64Const [$]2454267027"
return i / 14
}
+func div28_uint32(i uint32) uint32 {
+ // amd64: "MOVQ [$]658812288404291584,"
+ // amd64: "MULQ"
+ // amd64: -"SHRQ"
+ // arm64: "MOVD [$]658812288404291584,"
+ // arm64: "UMULH"
+ // arm64: -"UBFX"
+ // arm64: -"LSR"
+ // wasm: "I64Const [$]2454267027"
+ return i / 28
+}
+
func div14_uint64(i uint64) uint64 {
// 386: "MOVL [$]-1840700270,"
// 386: "MULL"
@@ -364,12 +379,13 @@
// 386: "ADDL"
// 386: "RCRL [$]1,"
// 386: "SHRL [$]2,"
- // arm64: "UBFIZ [$]32, R[0-9]+, [$]32,"
- // arm64: "MOVD [$]613566757,"
- // arm64: "MUL"
- // arm64: "SUB"
- // arm64: "ADD R[0-9]+>>1,"
- // arm64: "LSR [$]34,"
+ // amd64: "MOVQ [$]2635249153617166336,"
+ // amd64: "MULQ"
+ // amd64: -"RCR"
+ // arm64: "MOVD [$]2635249153617166336,"
+ // arm64: "UMULH"
+ // arm64: -"UBFIZ"
+ // arm64: -"LSR"
// wasm: "I64Const [$]613566757"
return i / 7
}
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Congratulations on opening your first change. Thank you for your contribution!
Next steps:
A maintainer will review your change and provide feedback. See
https://go.dev/doc/contribute#review for more info and tips to get your
patch through code review.
Most changes in the Go project go through a few rounds of revision. This can be
surprising to people new to the project. The careful, iterative review process
is our way of helping mentor contributors and ensuring that their contributions
have a lasting impact.
During May-July and Nov-Jan the Go project is in a code freeze, during which
little code gets reviewed or merged. If a reviewer responds with a comment like
R=go1.11 or adds a tag like "wait-release", it means that this CL will be
reviewed as part of the next development cycle. See https://go.dev/s/release
for more details.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Benchmark on Intel Xeon w9-3495X (3 divisions per iteration, go1.26.3):Where is the benchmark?
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Sorry for the late reply. I had commented right away but it was left in draft state.
Benchmark on Intel Xeon w9-3495X (3 divisions per iteration, go1.26.3):Where is the benchmark?
The benchmark I used is not part of this CL. Here it is for reference:
```go
package gobench
import (
"os"
"testing"
)
var sink uint32
// BenchmarkOdd: divisors
func BenchmarkOdd(b *testing.B) {
ret := uint32(len(os.Args)) * 0x12345678
for i := 0; i < b.N; i++ {
x := uint32(i) ^ ret
ret ^= x / 7
ret ^= x / 19
ret ^= x / 107
}
sink = ret
}
// BenchmarkEven: divisors 14, 38, 214
func BenchmarkEven(b *testing.B) {
ret := uint32(len(os.Args)) * 0x12345678
for i := 0; i < b.N; i++ {
x := uint32(i) ^ ret
ret ^= x / 14
ret ^= x / 38
ret ^= x / 214
}
sink = ret
}
```
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
We generally put the benchmark code in the CL, so we have that benchmark available for future improvements. Probably cmd/compile/internal/test/math_test.go would be the right place.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Benchmark on Intel Xeon w9-3495X (3 divisions per iteration, go1.26.3):Done. Added BenchmarkDiv32UnsignedConstOdd and BenchmarkDiv32UnsignedConstEven to cmd/compile/internal/test/math_test.go.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |