shigeo MITSUNARI (Gerrit)

unread,

Jun 1, 2026, 11:16:02 PM (8 days ago) Jun 1

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

shigeo MITSUNARI has uploaded the change for review

Commit message

cmd/compile: optimize 32-bit unsigned division by pre-shifted magic constant

For unsigned 32-bit division by a constant d on 64-bit targets (amd64,
arm64, riscv64, ...), when the magic multiplier M = 2^32 + m has m odd
(i.e. M is truly 33-bit and cannot be halved to fit in 32 bits), use a
pre-shifted constant C = M << (32 - s) so that a single Hmul64u gives
the quotient directly:

      x / d = Hmul64u(ZeroExt32to64(x), C)

This replaces the avg-based sequence for odd divisors (Case 3) and the
SHR+IMULQ+SHR sequence for even divisors whose underlying magic is
33-bit (e.g. d=14=2×7). Divisors where m is even (Case 6) keep their
existing 32-bit constant path to avoid loading a large 64-bit constant
on arm64/riscv64.

Benchmark on Intel Xeon w9-3495X (3 divisions per iteration, go1.26.3):
  BenchmarkOdd  (d=7,19,107):   3.07 ns/op → 1.92 ns/op  (~38% faster)
  BenchmarkEven (d=14,38,214):  2.04 ns/op → 1.92 ns/op  (~6% faster)

Ref: https://arxiv.org/abs/2604.07902
     https://github.com/llvm/llvm-project/pull/181288

Fixes #79780

Change-Id: I33f551f4f6715ddbcc39f9f56c1331f31f0eca33

Change diff

diff --git a/src/cmd/compile/internal/ssa/_gen/divmod.rules b/src/cmd/compile/internal/ssa/_gen/divmod.rules
index f06a435..976599a 100644
--- a/src/cmd/compile/internal/ssa/_gen/divmod.rules
+++ b/src/cmd/compile/internal/ssa/_gen/divmod.rules
@@ -182,7 +182,20 @@
     (Hmul64u <typ.UInt64> x (Const64 <typ.UInt64> [int64(1<<63 + umagic64(c).m/2)]))
     (Const64 <typ.UInt64> [umagic64(c).s - 1]))
 
+// Pre-shifted: on non-wasm 64-bit targets, when m is odd (magic is truly 33-bit and cannot
+// be halved to fit in 32 bits). This covers odd divisors (Case 3) and even divisors whose
+// underlying magic is 33-bit (e.g. d=14=2x7 shares the same m as d=7).
+// When m is even (Case 6), the 32-bit constant path above is preferred to avoid loading
+// a large 64-bit constant on aarch64/riscv64.
+(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name != "wasm" && umagic32(c).m&1 != 0 =>
+  (Trunc64to32 <t>
+    (Hmul64u <typ.UInt64>
+      (ZeroExt32to64 x)
+      (Const64 <typ.UInt64> [int64(umagic32PreShifted(c))])))
+
 // Case 7. Unsigned divide where c is even.
+// For non-wasm 64-bit with m odd, the pre-shifted rule above takes priority.
+// This rule handles wasm (m odd) and 32-bit targets.
 (Div16u <t> x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && c&1 == 0 =>
   (Trunc32to16 <t>
     (Rsh32Ux64 <typ.UInt32>
@@ -218,7 +231,7 @@
         (Lsh32x64 <typ.UInt32> (ZeroExt16to32 x) (Const64 <typ.UInt64> [16]))
         (Mul32 <typ.UInt32> (ZeroExt16to32 x) (Const32 <typ.UInt32> [int32(umagic16(c).m)])))
       (Const64 <typ.UInt64> [16 + umagic16(c).s - 1])))
-(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 =>
+(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name == "wasm" =>
   (Trunc64to32 <t>
     (Rsh64Ux64 <typ.UInt64>
       (Avg64u
diff --git a/src/cmd/compile/internal/ssa/magic.go b/src/cmd/compile/internal/ssa/magic.go
index 29a57fb..2d4f572 100644
--- a/src/cmd/compile/internal/ssa/magic.go
+++ b/src/cmd/compile/internal/ssa/magic.go
@@ -136,6 +136,19 @@
 func umagic32(c int32) umagicData { return umagic(32, int64(c)) }
 func umagic64(c int64) umagicData { return umagic(64, c) }
 
+// umagic32PreShifted returns the pre-shifted 64-bit magic constant for unsigned 32-bit
+// division by c on 64-bit targets that have a native 64x64->128-bit multiply instruction
+// (amd64 MULQ, arm64 UMULH, riscv64 MULHU, etc.), enabling:
+//
+//	x / c = Hmul64u(ZeroExt32to64(x), umagic32PreShifted(c))
+//
+// Given umagic32(c) returning m and s, the constant is (2^32 + m) << (32 - s).
+// Valid when umagicOK32(c) is true. Result always fits in uint64.
+func umagic32PreShifted(c int32) uint64 {
+	magic := umagic32(c)
+	return (uint64(1)<<32 + magic.m) << uint(32-magic.s)
+}
+
 // For signed division, we use a similar strategy.
 // First, we enforce a positive c.
 //   x / c = -(x / (-c))
diff --git a/src/cmd/compile/internal/ssa/rewritedivmod.go b/src/cmd/compile/internal/ssa/rewritedivmod.go
index 1ee39b3..6a4e803 100644
--- a/src/cmd/compile/internal/ssa/rewritedivmod.go
+++ b/src/cmd/compile/internal/ssa/rewritedivmod.go
@@ -487,6 +487,30 @@
 		return true
 	}
 	// match: (Div32u <t> x (Const32 [c]))
+	// cond: umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name != "wasm" && umagic32(c).m&1 != 0
+	// result: (Trunc64to32 <t> (Hmul64u <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [int64(umagic32PreShifted(c))])))
+	for {
+		t := v.Type
+		x := v_0
+		if v_1.Op != OpConst32 {
+			break
+		}
+		c := auxIntToInt32(v_1.AuxInt)
+		if !(umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name != "wasm" && umagic32(c).m&1 != 0) {
+			break
+		}
+		v.reset(OpTrunc64to32)
+		v.Type = t
+		v0 := b.NewValue0(v.Pos, OpHmul64u, typ.UInt64)
+		v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
+		v1.AddArg(x)
+		v2 := b.NewValue0(v.Pos, OpConst64, typ.UInt64)
+		v2.AuxInt = int64ToAuxInt(int64(umagic32PreShifted(c)))
+		v0.AddArg2(v1, v2)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (Div32u <t> x (Const32 [c]))
 	// cond: umagicOK32(c) && config.RegSize == 8 && c&1 == 0
 	// result: (Trunc64to32 <t> (Rsh64Ux64 <typ.UInt64> (Mul64 <typ.UInt64> (Rsh64Ux64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [1])) (Const64 <typ.UInt64> [int64(1<<31 + (umagic32(c).m+1)/2)])) (Const64 <typ.UInt64> [32 + umagic32(c).s - 2])))
 	for {
@@ -547,7 +571,7 @@
 		return true
 	}
 	// match: (Div32u <t> x (Const32 [c]))
-	// cond: umagicOK32(c) && config.RegSize == 8
+	// cond: umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name == "wasm"
 	// result: (Trunc64to32 <t> (Rsh64Ux64 <typ.UInt64> (Avg64u (Lsh64x64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [32])) (Mul64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [int64(umagic32(c).m)]))) (Const64 <typ.UInt64> [32 + umagic32(c).s - 1])))
 	for {
 		t := v.Type
@@ -556,7 +580,7 @@
 			break
 		}
 		c := auxIntToInt32(v_1.AuxInt)
-		if !(umagicOK32(c) && config.RegSize == 8) {
+		if !(umagicOK32(c) && config.RegSize == 8 && config.ctxt.Arch.Name == "wasm") {
 			break
 		}
 		v.reset(OpTrunc64to32)
diff --git a/test/codegen/divmod.go b/test/codegen/divmod.go
index adcba5d..05ee930 100644
--- a/test/codegen/divmod.go
+++ b/test/codegen/divmod.go
@@ -325,14 +325,29 @@
 	// 386: "SHRL [$]1,"
 	// 386: "MOVL [$]-1840700269,"
 	// 386: "SHRL [$]2,"
-	// arm64: "UBFX [$]1, R[0-9]+, [$]31,"
-	// arm64: "MOVD [$]2454267027,"
-	// arm64: "MUL"
-	// arm64: "LSR [$]34,"
+	// amd64: "MOVQ [$]1317624576808583168,"
+	// amd64: "MULQ"
+	// amd64: -"SHRQ"
+	// arm64: "MOVD [$]1317624576808583168,"
+	// arm64: "UMULH"
+	// arm64: -"UBFX"
+	// arm64: -"LSR"
 	// wasm: "I64Const [$]2454267027"
 	return i / 14
 }
 
+func div28_uint32(i uint32) uint32 {
+	// amd64: "MOVQ [$]658812288404291584,"
+	// amd64: "MULQ"
+	// amd64: -"SHRQ"
+	// arm64: "MOVD [$]658812288404291584,"
+	// arm64: "UMULH"
+	// arm64: -"UBFX"
+	// arm64: -"LSR"
+	// wasm: "I64Const [$]2454267027"
+	return i / 28
+}
+
 func div14_uint64(i uint64) uint64 {
 	// 386: "MOVL [$]-1840700270,"
 	// 386: "MULL"
@@ -364,12 +379,13 @@
 	// 386: "ADDL"
 	// 386: "RCRL [$]1,"
 	// 386: "SHRL [$]2,"
-	// arm64: "UBFIZ [$]32, R[0-9]+, [$]32,"
-	// arm64: "MOVD [$]613566757,"
-	// arm64: "MUL"
-	// arm64: "SUB"
-	// arm64: "ADD R[0-9]+>>1,"
-	// arm64: "LSR [$]34,"
+	// amd64: "MOVQ [$]2635249153617166336,"
+	// amd64: "MULQ"
+	// amd64: -"RCR"
+	// arm64: "MOVD [$]2635249153617166336,"
+	// arm64: "UMULH"
+	// arm64: -"UBFIZ"
+	// arm64: -"LSR"
 	// wasm: "I64Const [$]613566757"
 	return i / 7
 }

Change information

Files:

M src/cmd/compile/internal/ssa/_gen/divmod.rules
M src/cmd/compile/internal/ssa/magic.go
M src/cmd/compile/internal/ssa/rewritedivmod.go
M test/codegen/divmod.go

Change size: M

Delta: 4 files changed, 79 insertions(+), 13 deletions(-)

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Gopher Robot (Gerrit)

unread,

Jun 1, 2026, 11:20:09 PM (8 days ago) Jun 1

to shigeo MITSUNARI, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Message from Gopher Robot

Congratulations on opening your first change. Thank you for your contribution!

Next steps:
A maintainer will review your change and provide feedback. See
https://go.dev/doc/contribute#review for more info and tips to get your
patch through code review.

Most changes in the Go project go through a few rounds of revision. This can be
surprising to people new to the project. The careful, iterative review process
is our way of helping mentor contributors and ensuring that their contributions
have a lasting impact.

During May-July and Nov-Jan the Go project is in a code freeze, during which
little code gets reviewed or merged. If a reviewer responds with a comment like
R=go1.11 or adds a tag like "wait-release", it means that this CL will be
reviewed as part of the next development cycle. See https://go.dev/s/release
for more details.

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Alan Donovan (Gerrit)

unread,

Jun 2, 2026, 9:48:08 AM (8 days ago) Jun 2

to shigeo MITSUNARI, goph...@pubsubhelper.golang.org, Keith Randall, Martin Möhrmann, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Keith Randall, Martin Möhrmann and shigeo MITSUNARI

Alan Donovan added 1 comment

Commit Message

Line 23, Patchset 1 (Latest):Benchmark on Intel Xeon w9-3495X (3 divisions per iteration, go1.26.3):

Alan Donovan . unresolved

Where is the benchmark?

Open in Gerrit

Related details

Attention is currently required from:

Keith Randall
Martin Möhrmann
shigeo MITSUNARI

Submit Requirements:

Code-Review

No-Unresolved-Comments

Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

shigeo MITSUNARI (Gerrit)

unread,

Jun 9, 2026, 4:34:12 AM (yesterday) Jun 9

to goph...@pubsubhelper.golang.org, Alan Donovan, Keith Randall, Martin Möhrmann, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Alan Donovan, Keith Randall and Martin Möhrmann

shigeo MITSUNARI added 2 comments

Patchset-level comments

File-level comment, Patchset 1 (Latest):

shigeo MITSUNARI . unresolved

Sorry for the late reply. I had commented right away but it was left in draft state.

Commit Message

Line 23, Patchset 1 (Latest):Benchmark on Intel Xeon w9-3495X (3 divisions per iteration, go1.26.3):

Alan Donovan . unresolved

Where is the benchmark?

shigeo MITSUNARI

The benchmark I used is not part of this CL. Here it is for reference:

```go
package gobench

import (
  "os"
  "testing"
)

var sink uint32

// BenchmarkOdd: divisors
func BenchmarkOdd(b *testing.B) {
  ret := uint32(len(os.Args)) * 0x12345678
  for i := 0; i < b.N; i++ {
      x := uint32(i) ^ ret
      ret ^= x / 7
      ret ^= x / 19
      ret ^= x / 107
  }
  sink = ret
}

// BenchmarkEven: divisors 14, 38, 214
func BenchmarkEven(b *testing.B) {
  ret := uint32(len(os.Args)) * 0x12345678
  for i := 0; i < b.N; i++ {
      x := uint32(i) ^ ret
      ret ^= x / 14
      ret ^= x / 38
      ret ^= x / 214
  }
  sink = ret
}
```

Open in Gerrit

Related details

Attention is currently required from:

Alan Donovan
Keith Randall
Martin Möhrmann

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Keith Randall (Gerrit)

unread,

Jun 9, 2026, 11:56:48 AM (17 hours ago) Jun 9

to shigeo MITSUNARI, goph...@pubsubhelper.golang.org, Alan Donovan, Keith Randall, Martin Möhrmann, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Alan Donovan, Martin Möhrmann and shigeo MITSUNARI

Keith Randall added 1 comment

Commit Message

Keith Randall

We generally put the benchmark code in the CL, so we have that benchmark available for future improvements. Probably cmd/compile/internal/test/math_test.go would be the right place.

Open in Gerrit

Related details

Attention is currently required from:

Alan Donovan
Martin Möhrmann
shigeo MITSUNARI

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

shigeo MITSUNARI (Gerrit)

unread,

1:00 AM (4 hours ago) 1:00 AM

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Alan Donovan, Martin Möhrmann and shigeo MITSUNARI

shigeo MITSUNARI uploaded new patchset

shigeo MITSUNARI uploaded patch set #2 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Alan Donovan
Martin Möhrmann
shigeo MITSUNARI

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

shigeo MITSUNARI (Gerrit)

unread,

1:05 AM (4 hours ago) 1:05 AM

to goph...@pubsubhelper.golang.org, Alan Donovan, Keith Randall, Martin Möhrmann, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Alan Donovan, Keith Randall and Martin Möhrmann

shigeo MITSUNARI added 1 comment

Commit Message

Line 23, Patchset 1:Benchmark on Intel Xeon w9-3495X (3 divisions per iteration, go1.26.3):

shigeo MITSUNARI

Done. Added BenchmarkDiv32UnsignedConstOdd and BenchmarkDiv32UnsignedConstEven to cmd/compile/internal/test/math_test.go.

Open in Gerrit

Related details

Attention is currently required from:

Alan Donovan
Keith Randall
Martin Möhrmann

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Reply all

Reply to author

Forward

[go] cmd/compile: optimize 32-bit unsigned division by pre-shifted magic constant

shigeo MITSUNARI (Gerrit)

shigeo MITSUNARI has uploaded the change for review

Commit message

Change diff

Change information

Related details

Gopher Robot (Gerrit)

Message from Gopher Robot

Related details

Alan Donovan (Gerrit)

Alan Donovan added 1 comment

Related details

shigeo MITSUNARI (Gerrit)

shigeo MITSUNARI added 2 comments

Related details

Keith Randall (Gerrit)

Keith Randall added 1 comment

Related details

shigeo MITSUNARI (Gerrit)

shigeo MITSUNARI uploaded new patchset

Related details

shigeo MITSUNARI (Gerrit)

shigeo MITSUNARI added 1 comment

Related details