Brian Kessler has uploaded this change for review.
math/bits: add extended precision Add, Sub, Mul, Div
Port math/big pure go versions of add-with-carry, subtract-with-borrow,
full-width multiply and full-width divide. Add SSA rules to intrinsify
Mul/Mul64 (AMD64 and ARM64) and Div/Div64 (AMD64). SSA rules for other
functions and architectures are left as a future optimization. Benchmark
results on AMD64 before and after SSA implementation are below.
name old time/op new time/op delta
Add-4 1.80ns ± 2% 1.79ns ± 2% ~ (p=0.397 n=5+5)
Add32-4 1.71ns ± 1% 1.71ns ± 2% ~ (p=1.000 n=5+5)
Add64-4 1.86ns ±10% 1.78ns ± 1% ~ (p=0.143 n=5+5)
Sub-4 1.79ns ± 2% 1.78ns ± 0% ~ (p=0.413 n=5+4)
Sub32-4 1.78ns ± 0% 1.78ns ± 0% ~ (p=0.333 n=5+4)
Sub64-4 1.78ns ± 0% 1.78ns ± 0% ~ (all equal)
Mul-4 6.87ns ± 1% 1.78ns ± 1% -74.09% (p=0.008 n=5+5)
Mul32-4 1.40ns ± 4% 1.37ns ± 3% ~ (p=0.413 n=5+5)
Mul64-4 6.83ns ± 0% 1.78ns ± 0% -73.99% (p=0.000 n=5+4)
Div-4 54.6ns ± 0% 48.6ns ± 4% -10.95% (p=0.016 n=4+5)
Div32-4 18.1ns ± 1% 18.1ns ± 2% ~ (p=1.000 n=5+5)
Div64-4 54.7ns ± 0% 47.8ns ± 0% -12.65% (p=0.000 n=5+4)
Updates #24813
Change-Id: Ifae5d2f6ee4237137c9dcba931f69c91b80a4b1c
---
M src/cmd/compile/internal/gc/ssa.go
M src/math/bits/bits.go
M src/math/bits/bits_test.go
3 files changed, 572 insertions(+), 0 deletions(-)
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index ff2b93d..42aa4e2 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -3325,6 +3325,26 @@
addF("math/bits", "OnesCount",
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32),
sys.AMD64)
+ addF("math/bits", "Mul",
+ func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+ return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[TUINT], types.Types[TUINT]), args[0], args[1])
+ },
+ sys.AMD64, sys.ARM64)
+ addF("math/bits", "Mul64",
+ func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+ return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1])
+ },
+ sys.AMD64, sys.ARM64)
+ addF("math/bits", "Div",
+ func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+ return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[TUINT], types.Types[TUINT]), args[0], args[1], args[2])
+ },
+ sys.AMD64)
+ addF("math/bits", "Div64",
+ func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+ return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1], args[2])
+ },
+ sys.AMD64)
/******** sync/atomic ********/
diff --git a/src/math/bits/bits.go b/src/math/bits/bits.go
index 989baac..f828842 100644
--- a/src/math/bits/bits.go
+++ b/src/math/bits/bits.go
@@ -328,3 +328,240 @@
}
return n + int(len8tab[x])
}
+
+// --- Add with carry ---
+
// Add returns the sum with carry of x, y and carry: sum = x + y + carry.
// The carry input must be 0 or 1; otherwise the behavior is undefined.
// The carryOut output is guaranteed to be 0 or 1.
func Add(x, y, carry uint) (sum, carryOut uint) {
	t := y + carry // wraps to 0 only when y is all ones and carry is 1
	sum = x + t
	// A carry out occurred if either addition wrapped around.
	if t < y || sum < x {
		carryOut = 1
	}
	return sum, carryOut
}
+
// Add32 returns the sum with carry of x, y and carry: sum = x + y + carry.
// The carry input must be 0 or 1; otherwise the behavior is undefined.
// The carryOut output is guaranteed to be 0 or 1.
func Add32(x, y, carry uint32) (sum, carryOut uint32) {
	t := y + carry // wraps to 0 only when y is all ones and carry is 1
	sum = x + t
	// A carry out occurred if either addition wrapped around.
	if t < y || sum < x {
		carryOut = 1
	}
	return sum, carryOut
}
+
// Add64 returns the sum with carry of x, y and carry: sum = x + y + carry.
// The carry input must be 0 or 1; otherwise the behavior is undefined.
// The carryOut output is guaranteed to be 0 or 1.
func Add64(x, y, carry uint64) (sum, carryOut uint64) {
	partial := y + carry
	wrapped1 := partial < y // y + carry overflowed 64 bits
	sum = x + partial
	wrapped2 := sum < x // x + (y + carry) overflowed 64 bits
	if wrapped1 || wrapped2 {
		carryOut = 1
	}
	return
}
+
+// --- Subtract with borrow ---
+
// Sub returns the difference of x, y and borrow: difference = x - y - borrow.
// The borrow input must be 0 or 1; otherwise the behavior is undefined.
// The borrowOut output is guaranteed to be 0 or 1.
func Sub(x, y, borrow uint) (difference, borrowOut uint) {
	t := y + borrow // wraps to 0 only when y is all ones and borrow is 1
	difference = x - t
	// A borrow out occurred if y+borrow wrapped or the subtraction wrapped.
	if t < y || difference > x {
		borrowOut = 1
	}
	return difference, borrowOut
}
+
// Sub32 returns the difference of x, y and borrow: difference = x - y - borrow.
// The borrow input must be 0 or 1; otherwise the behavior is undefined.
// The borrowOut output is guaranteed to be 0 or 1.
func Sub32(x, y, borrow uint32) (difference, borrowOut uint32) {
	t := y + borrow // wraps to 0 only when y is all ones and borrow is 1
	difference = x - t
	// A borrow out occurred if y+borrow wrapped or the subtraction wrapped.
	if t < y || difference > x {
		borrowOut = 1
	}
	return difference, borrowOut
}
+
// Sub64 returns the difference of x, y and borrow: difference = x - y - borrow.
// The borrow input must be 0 or 1; otherwise the behavior is undefined.
// The borrowOut output is guaranteed to be 0 or 1.
func Sub64(x, y, borrow uint64) (difference, borrowOut uint64) {
	subtrahend := y + borrow
	wrapped1 := subtrahend < y // y + borrow overflowed 64 bits
	difference = x - subtrahend
	wrapped2 := difference > x // x - subtrahend underflowed
	if wrapped1 || wrapped2 {
		borrowOut = 1
	}
	return
}
+
+// --- Full-width multiply ---
+
+// Mul returns the full-width product of x and y the bits of the upper half
+// are returned in hi and the bits of the lower half are returned in lo
+func Mul(x, y uint) (hi, lo uint) {
+ const (
+ halfUintSize = uintSize / 2
+ halfUintMask = (1 << halfUintSize) - 1
+ )
+ x0 := x & halfUintMask
+ x1 := x >> halfUintSize
+ y0 := y & halfUintMask
+ y1 := y >> halfUintSize
+ w0 := x0 * y0
+ t := x1*y0 + w0>>halfUintSize
+ w1 := t & halfUintMask
+ w2 := t >> halfUintSize
+ w1 += x0 * y1
+ hi = x1*y1 + w2 + w1>>halfUintSize
+ lo = x * y
+ return
+}
+
// Mul32 returns the 64-bit product of x and y: (hi, lo) = x * y
// with the product bits' upper half returned in hi and the lower
// half returned in lo.
func Mul32(x, y uint32) (hi, lo uint32) {
	// A single 64-bit multiply holds the full 32x32 product.
	tmp := uint64(x) * uint64(y)
	hi, lo = uint32(tmp>>32), uint32(tmp)
	return
}
+
// Mul64 returns the 128-bit product of x and y: (hi, lo) = x * y
// with the product bits' upper half returned in hi and the lower
// half returned in lo.
func Mul64(x, y uint64) (hi, lo uint64) {
	const (
		halfUintSize = 32
		halfUintMask = (1 << halfUintSize) - 1
	)
	// Schoolbook multiplication on 32-bit half-words: the cross
	// terms are accumulated into the high word, while the low word
	// falls directly out of the truncating 64-bit multiply below.
	x0 := x & halfUintMask
	x1 := x >> halfUintSize
	y0 := y & halfUintMask
	y1 := y >> halfUintSize
	w0 := x0 * y0
	t := x1*y0 + w0>>halfUintSize
	w1 := t & halfUintMask
	w2 := t >> halfUintSize
	w1 += x0 * y1
	hi = x1*y1 + w2 + w1>>halfUintSize
	lo = x * y
	return
}
+
+// --- Full-width divide ---
+
+// Div returns the quotient and remainder of a double-word division of (hi, lo)
+// by x, where hi represents the high word of the input and lo the low word.
+// if hi >=x, the behavior is undefined because the quotient will not fit in a single word.
+func Div(hi, lo, x uint) (quo, rem uint) {
+ const (
+ halfUintSize = uintSize / 2
+ halfUintBase = (1 << halfUintSize)
+ halfUintMask = halfUintBase - 1
+ )
+
+ if hi >= x {
+ return 1<<UintSize - 1, 1<<UintSize - 1
+ }
+
+ s := uint(LeadingZeros(x))
+ x <<= s
+
+ xn1 := x >> halfUintSize
+ xn0 := x & halfUintMask
+ un32 := hi<<s | lo>>(UintSize-s)
+ un10 := lo << s
+ un1 := un10 >> halfUintSize
+ un0 := un10 & halfUintMask
+ q1 := un32 / xn1
+ rhat := un32 - q1*xn1
+
+ for q1 >= halfUintBase || q1*xn0 > halfUintBase*rhat+un1 {
+ q1--
+ rhat += xn1
+ if rhat >= halfUintBase {
+ break
+ }
+ }
+
+ un21 := un32*halfUintBase + un1 - q1*x
+ q0 := un21 / xn1
+ rhat = un21 - q0*xn1
+
+ for q0 >= halfUintBase || q0*xn0 > halfUintBase*rhat+un0 {
+ q0--
+ rhat += xn1
+ if rhat >= halfUintBase {
+ break
+ }
+ }
+
+ return q1*halfUintBase + q0, (un21*halfUintBase + un0 - q0*x) >> s
+}
+
// Div32 returns the quotient and remainder of (hi, lo) divided by x:
// quo = (hi, lo)/x, rem = (hi, lo)%x with the dividend bits' upper
// half in parameter hi and the lower half in parameter lo.
// hi must be < x otherwise the behavior is undefined (the quotient
// won't fit into 32 bits).
func Div32(hi, lo, x uint32) (quo, rem uint32) {
	// The 64-bit dividend fits in a uint64, so native division
	// suffices; the results are truncated back to 32 bits.
	z := uint64(hi)<<32 | uint64(lo)
	quo, rem = uint32(z/uint64(x)), uint32(z%uint64(x))
	return
}
+
+// Div64 returns the quotient and remainder of a 128-bit division of (hi, lo)
+// by x, where hi represents the high 64-bits of the input and lo the low 64-bits.
+// if hi >=x, the behavior is undefined because the quotient will not fit in 64-bits.
+func Div64(hi, lo, x uint64) (quo, rem uint64) {
+ const (
+ halfUintSize = 32
+ halfUintBase = (1 << halfUintSize)
+ halfUintMask = halfUintBase - 1
+ )
+
+ if hi >= x {
+ return 1<<UintSize - 1, 1<<UintSize - 1
+ }
+
+ s := uint(LeadingZeros64(x))
+ x <<= s
+
+ xn1 := x >> halfUintSize
+ xn0 := x & halfUintMask
+ un32 := hi<<s | lo>>(UintSize-s)
+ un10 := lo << s
+ un1 := un10 >> halfUintSize
+ un0 := un10 & halfUintMask
+ q1 := un32 / xn1
+ rhat := un32 - q1*xn1
+
+ for q1 >= halfUintBase || q1*xn0 > halfUintBase*rhat+un1 {
+ q1--
+ rhat += xn1
+ if rhat >= halfUintBase {
+ break
+ }
+ }
+
+ un21 := un32*halfUintBase + un1 - q1*x
+ q0 := un21 / xn1
+ rhat = un21 - q0*xn1
+
+ for q0 >= halfUintBase || q0*xn0 > halfUintBase*rhat+un0 {
+ q0--
+ rhat += xn1
+ if rhat >= halfUintBase {
+ break
+ }
+ }
+
+ return q1*halfUintBase + q0, (un21*halfUintBase + un0 - q0*x) >> s
+}
diff --git a/src/math/bits/bits_test.go b/src/math/bits/bits_test.go
index 5c34f6d..33b3cb9 100644
--- a/src/math/bits/bits_test.go
+++ b/src/math/bits/bits_test.go
@@ -705,6 +705,321 @@
}
}
+type funUint func(x, y, c uint) (z, cout uint)
+type funUint32 func(x, y, c uint32) (z, cout uint32)
+type funUint64 func(x, y, c uint64) (z, cout uint64)
+type argUint struct {
+ x, y, c, cout, z uint
+}
+type argUint32 struct {
+ x, y, c, cout, z uint32
+}
+type argUint64 struct {
+ x, y, c, cout, z uint64
+}
+
+const (
+ _M = 1<<UintSize - 1
+ _M32 = 1<<32 - 1
+ _M64 = 1<<64 - 1
+)
+
+var sumUint = []argUint{
+ {0, 0, 0, 0, 0},
+ {0, 1, 0, 0, 1},
+ {0, 0, 1, 0, 1},
+ {0, 1, 1, 0, 2},
+ {12345, 67890, 0, 0, 80235},
+ {12345, 67890, 1, 0, 80236},
+ {_M, 1, 0, 1, 0},
+ {_M, 0, 1, 1, 0},
+ {_M, 1, 1, 1, 1},
+ {_M, _M, 0, 1, _M - 1},
+ {_M, _M, 1, 1, _M},
+}
+
+func testAddSubUint(t *testing.T, msg string, f funUint, a argUint) {
+ z, cout := f(a.x, a.y, a.c)
+ if z != a.z || cout != a.cout {
+ t.Errorf("%s%+v\n\tgot z:cout = %#x:%#x; want %#x:%#x", msg, a, z, cout, a.z, a.cout)
+ }
+}
+
+func TestAddSubUint(t *testing.T) {
+ for _, a := range sumUint {
+ arg := a
+ testAddSubUint(t, "Add", Add, arg)
+
+ arg = argUint{a.y, a.x, a.c, a.cout, a.z}
+ testAddSubUint(t, "Add symmetric", Add, arg)
+
+ arg = argUint{a.z, a.x, a.c, a.cout, a.y}
+ testAddSubUint(t, "Sub", Sub, arg)
+
+ arg = argUint{a.z, a.y, a.c, a.cout, a.x}
+ testAddSubUint(t, "Sub symmetric", Sub, arg)
+ }
+}
+
+var sumUint32 = []argUint32{
+ {0, 0, 0, 0, 0},
+ {0, 1, 0, 0, 1},
+ {0, 0, 1, 0, 1},
+ {0, 1, 1, 0, 2},
+ {12345, 67890, 0, 0, 80235},
+ {12345, 67890, 1, 0, 80236},
+ {_M32, 1, 0, 1, 0},
+ {_M32, 0, 1, 1, 0},
+ {_M32, 1, 1, 1, 1},
+ {_M32, _M32, 0, 1, _M32 - 1},
+ {_M32, _M32, 1, 1, _M32},
+}
+
+func testAddSubUint32(t *testing.T, msg string, f funUint32, a argUint32) {
+ z, cout := f(a.x, a.y, a.c)
+ if z != a.z || cout != a.cout {
+ t.Errorf("%s%+v\n\tgot z:cout = %#x:%#x; want %#x:%#x", msg, a, z, cout, a.z, a.cout)
+ }
+}
+
+func TestAddSubUint32(t *testing.T) {
+ for _, a := range sumUint32 {
+ arg := a
+ testAddSubUint32(t, "Add32", Add32, arg)
+
+ arg = argUint32{a.y, a.x, a.c, a.cout, a.z}
+ testAddSubUint32(t, "Add32 symmetric", Add32, arg)
+
+ arg = argUint32{a.z, a.x, a.c, a.cout, a.y}
+ testAddSubUint32(t, "Sub32", Sub32, arg)
+
+ arg = argUint32{a.z, a.y, a.c, a.cout, a.x}
+ testAddSubUint32(t, "Sub32 symmetric", Sub32, arg)
+ }
+}
+
+var sumUint64 = []argUint64{
+ {0, 0, 0, 0, 0},
+ {0, 1, 0, 0, 1},
+ {0, 0, 1, 0, 1},
+ {0, 1, 1, 0, 2},
+ {12345, 67890, 0, 0, 80235},
+ {12345, 67890, 1, 0, 80236},
+ {_M64, 1, 0, 1, 0},
+ {_M64, 0, 1, 1, 0},
+ {_M64, 1, 1, 1, 1},
+ {_M64, _M64, 0, 1, _M64 - 1},
+ {_M64, _M64, 1, 1, _M64},
+}
+
+func testAddSubUint64(t *testing.T, msg string, f funUint64, a argUint64) {
+ z, cout := f(a.x, a.y, a.c)
+ if z != a.z || cout != a.cout {
+ t.Errorf("%s%+v\n\tgot z:cout = %#x:%#x; want %#x:%#x", msg, a, z, cout, a.z, a.cout)
+ }
+}
+
+func TestAddSubUint64(t *testing.T) {
+ for _, a := range sumUint64 {
+ arg := a
+ testAddSubUint64(t, "Add64", Add64, arg)
+
+ arg = argUint64{a.y, a.x, a.c, a.cout, a.z}
+ testAddSubUint64(t, "Add64 symmetric", Add64, arg)
+
+ arg = argUint64{a.z, a.x, a.c, a.cout, a.y}
+ testAddSubUint64(t, "Sub64", Sub64, arg)
+
+ arg = argUint64{a.z, a.y, a.c, a.cout, a.x}
+ testAddSubUint64(t, "Sub64 symmetric", Sub64, arg)
+ }
+}
+
+var mulUintTests = []struct {
+ x, y uint
+ hi, lo, r uint
+}{
+ {1 << (UintSize - 1), 2, 1, 0, 1},
+ {_M, _M, _M - 1, 1, 42},
+}
+
+func TestMulDiv(t *testing.T) {
+ for i, test := range mulUintTests {
+ hi, lo := Mul(test.x, test.y)
+ if hi != test.hi || lo != test.lo {
+ t.Errorf("#%d %d * %d got (%x, %x) want (%x, %x)", i, test.x, test.y, hi, lo, test.hi, test.lo)
+ }
+ hi, lo = Mul(test.y, test.x)
+ if hi != test.hi || lo != test.lo {
+ t.Errorf("#%d %d * %d got (%x, %x) want (%x, %x)", i, test.y, test.x, hi, lo, test.hi, test.lo)
+ }
+ q, r := Div(test.hi, test.lo+test.r, test.x)
+ if q != test.y || r != test.r {
+ t.Errorf("#%d (%d,%d) / %d got (%x, %x) want (%x, %x)", i, test.hi, test.lo, test.x, q, r, test.y, test.r)
+ }
+ q, r = Div(test.hi, test.lo+test.r, test.y)
+ if q != test.x || r != test.r {
+ t.Errorf("#%d (%d,%d) / %d got (%x, %x) want (%x, %x)", i, test.hi, test.lo, test.y, q, r, test.x, test.r)
+ }
+ }
+}
+
+var mulUint32Tests = []struct {
+ x, y uint32
+ hi, lo, r uint32
+}{
+ {1 << 31, 2, 1, 0, 1},
+ {0xc47dfa8c, 50911, 0x98a4, 0x998587f4, 13},
+ {_M32, _M32, _M32 - 1, 1, 42},
+}
+
+func TestMulDiv32(t *testing.T) {
+ for i, test := range mulUint32Tests {
+ hi, lo := Mul32(test.x, test.y)
+ if hi != test.hi || lo != test.lo {
+ t.Errorf("#%d %d * %d got (%d, %d) want (%d, %d)", i, test.x, test.y, hi, lo, test.hi, test.lo)
+ }
+ hi, lo = Mul32(test.y, test.x)
+ if hi != test.hi || lo != test.lo {
+ t.Errorf("#%d %d * %d got (%d, %d) want (%d, %d)", i, test.y, test.x, hi, lo, test.hi, test.lo)
+ }
+ q, r := Div32(test.hi, test.lo+test.r, test.x)
+ if q != test.y || r != test.r {
+ t.Errorf("#%d (%d,%d) / %d got (%d, %d) want (%d, %d)", i, test.hi, test.lo, test.x, q, r, test.y, test.r)
+ }
+ q, r = Div32(test.hi, test.lo+test.r, test.y)
+ if q != test.x || r != test.r {
+ t.Errorf("#%d (%d,%d) / %d got (%d, %d) want (%d, %d)", i, test.hi, test.lo, test.y, q, r, test.x, test.r)
+ }
+ }
+}
+
+var mulUint64Tests = []struct {
+ x, y uint64
+ hi, lo, r uint64
+}{
+ {1 << 63, 2, 1, 0, 1},
+ {0x3626229738a3b9, 0xd8988a9f1cc4a61, 0x2dd0712657fe8, 0x9dd6a3364c358319, 13},
+ {_M64, _M64, _M64 - 1, 1, 42},
+}
+
+func TestMulDiv64(t *testing.T) {
+ for i, test := range mulUint64Tests {
+ hi, lo := Mul64(test.x, test.y)
+ if hi != test.hi || lo != test.lo {
+ t.Errorf("#%d %d * %d got (%x, %x) want (%x, %x)", i, test.x, test.y, hi, lo, test.hi, test.lo)
+ }
+ hi, lo = Mul64(test.y, test.x)
+ if hi != test.hi || lo != test.lo {
+ t.Errorf("#%d %d * %d got (%x, %x) want (%x, %x)", i, test.y, test.x, hi, lo, test.hi, test.lo)
+ }
+ q, r := Div64(test.hi, test.lo+test.r, test.x)
+ if q != test.y || r != test.r {
+ t.Errorf("#%d (%d,%d) / %d got (%x, %x) want (%x, %x)", i, test.hi, test.lo, test.x, q, r, test.y, test.r)
+ }
+ q, r = Div64(test.hi, test.lo+test.r, test.y)
+ if q != test.x || r != test.r {
+ t.Errorf("#%d (%d,%d) / %d got (%x, %x) want (%x, %x)", i, test.hi, test.lo, test.y, q, r, test.x, test.r)
+ }
+ }
+}
+
+func BenchmarkAdd(b *testing.B) {
+ var z, c uint
+ for i := 0; i < b.N; i++ {
+ z, c = Add(uint(Input), uint(i), c)
+ }
+ Output = int(z + c)
+}
+
+func BenchmarkAdd32(b *testing.B) {
+ var z, c uint32
+ for i := 0; i < b.N; i++ {
+ z, c = Add32(uint32(Input), uint32(i), c)
+ }
+ Output = int(z + c)
+}
+
+func BenchmarkAdd64(b *testing.B) {
+ var z, c uint64
+ for i := 0; i < b.N; i++ {
+ z, c = Add64(uint64(Input), uint64(i), c)
+ }
+ Output = int(z + c)
+}
+
+func BenchmarkSub(b *testing.B) {
+ var z, c uint
+ for i := 0; i < b.N; i++ {
+ z, c = Sub(uint(Input), uint(i), c)
+ }
+ Output = int(z + c)
+}
+
+func BenchmarkSub32(b *testing.B) {
+ var z, c uint32
+ for i := 0; i < b.N; i++ {
+ z, c = Sub32(uint32(Input), uint32(i), c)
+ }
+ Output = int(z + c)
+}
+
+func BenchmarkSub64(b *testing.B) {
+ var z, c uint64
+ for i := 0; i < b.N; i++ {
+ z, c = Add64(uint64(Input), uint64(i), c)
+ }
+ Output = int(z + c)
+}
+
+func BenchmarkMul(b *testing.B) {
+ var hi, lo uint
+ for i := 0; i < b.N; i++ {
+ hi, lo = Mul(uint(Input), uint(i))
+ }
+ Output = int(hi + lo)
+}
+
+func BenchmarkMul32(b *testing.B) {
+ var hi, lo uint32
+ for i := 0; i < b.N; i++ {
+ hi, lo = Mul32(uint32(Input), uint32(i))
+ }
+ Output = int(hi + lo)
+}
+
+func BenchmarkMul64(b *testing.B) {
+ var hi, lo uint64
+ for i := 0; i < b.N; i++ {
+ hi, lo = Mul64(uint64(Input), uint64(i))
+ }
+ Output = int(hi + lo)
+}
+
+func BenchmarkDiv(b *testing.B) {
+ var q, r uint
+ for i := 0; i < b.N; i++ {
+ q, r = Div(1, uint(i), uint(Input))
+ }
+ Output = int(q + r)
+}
+
+func BenchmarkDiv32(b *testing.B) {
+ var q, r uint32
+ for i := 0; i < b.N; i++ {
+ q, r = Div32(1, uint32(i), uint32(Input))
+ }
+ Output = int(q + r)
+}
+
+func BenchmarkDiv64(b *testing.B) {
+ var q, r uint64
+ for i := 0; i < b.N; i++ {
+ q, r = Div64(1, uint64(i), uint64(Input))
+ }
+ Output = int(q + r)
+}
+
// ----------------------------------------------------------------------------
// Testing support
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
4 comments:
Patch Set #1, Line 412: func Mul(x, y uint) (hi, lo uint) {
I see that including the Mul64 implementation here outperforms calling Mul64 after checking UintSize. However shouldn't the compiler ideally be able to optimize that away? The following:
if UintSize == 32 {
hi32, lo32 := Mul32(uint32(x), uint32(y))
return uint(hi32), uint(lo32)
}
hi64, lo64 := Mul64(uint64(x), uint64(y))
return uint(hi64), uint(lo64)
should have the same performance as
if UintSize == 32 {
z := uint64(x) * uint64(y)
return uint(z >> 32), uint(z)
}
const (
halfUintSize = uintSize / 2
halfUintMask = (1 << halfUintSize) - 1
)
x0 := x & halfUintMask
x1 := x >> halfUintSize
y0 := y & halfUintMask
y1 := y >> halfUintSize
w0 := x0 * y0
t := x1*y0 + w0>>halfUintSize
w1 := t & halfUintMask
w2 := t >> halfUintSize
w1 += x0 * y1
hi = x1*y1 + w2 + w1>>halfUintSize
lo = x * y
return
Patch Set #1, Line 522: // if hi >=x, the behavior is undefined because the quotient will not fit in 64-bits.
I think we can be more clear on exactly the kind of division performed. Maybe something like the following?
// Div64 returns the quotient and remainder of a (128 / 64 -> 64)
// division of (hi || lo) and x, where hi and lo hold the most
// significant and least significant 64 bits of the dividend.
// If hi >= x, the behavior is undefined because the quotient will
// not fit into 64 bits.
Patch Set #1, Line 539: un32 := hi<<s | lo>>(UintSize-s)
As per Hacker's Delight. To make this division work "on machines that have mod 32 shifts (e.g., Intel x86)", we can AND
(lo >> (UintSize-s)) & uint64(-s>>31)
rewriting the above as
s := LeadingZeros64(x)
us := uint64(s)
un64 := (hi << us) | ((lo >> (UintSize - us)) & uint64(-s>>31))
Not entirely sure the effect this will have until it's tested on x86.
File src/math/bits/bits_test.go:
Patch Set #1, Line 838: var mulUintTests = []struct {
Perhaps there should be more multiplication and division test cases? I use code generation in https://github.com/smasher164/extprec/blob/74b3d0d41a013d3f2d010c9dcdee8b3e4eb7bbc9/make_factors.go to produce a factors table (https://github.com/smasher164/extprec/blob/74b3d0d41a013d3f2d010c9dcdee8b3e4eb7bbc9/factors.go) similar to this slice of structs you have here.
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
This CL is missing test/codegen/* tests to make sure that intrinsification keeps working.
1 comment:
Add SSA rules to intrinsify
Mul/Mul64 (AMD64 and ARM64) and Div/Div64 (AMD64).
please separate the SSA change out in an extra CL scoped to cmd/compile. Then that can reference the speedup to the pure go version.
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
Will update the initial CL and split the intrinsification into a separate follow-up CL.
4 comments:
Add SSA rules to intrinsify
Mul/Mul64 (AMD64 and ARM64) and Div/Div64 (AMD64).
please separate the SSA change out in an extra CL scoped to cmd/compile. […]
Sure, I'll split this into two CL's
Patch Set #1, Line 412: func Mul(x, y uint) (hi, lo uint) {
I see that including the Mul64 implementation here outperforms calling Mul64 after checking UintSize […]
Yes, it's probably better to eliminate the code duplication between the sized and unsized versions. Also, these SW fallbacks will mostly be avoided when appropriate intrinsics are used.
Patch Set #1, Line 539: un32 := hi<<s | lo>>(UintSize-s)
As per Hacker's Delight. To make this division work "on machines that have mod 32 shifts (e.g. […]
This was a mistake when porting over the math/big implementation with uint to uint64, the UintSize should be replaced with 64 here. I will correct this.
File src/math/bits/bits_test.go:
Patch Set #1, Line 838: var mulUintTests = []struct {
Perhaps there should be more multiplication and division test cases? I use code generation in https: […]
I just ported over the test cases from math/big. If the test cases involve additional cases, e.g. different carry behavior etc. then it may be worthwhile to add them, but I'm not sure its worth adding extra test cases that aren't for specific issues.
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
Brian Kessler uploaded patch set #2 to this change.
math/bits: add extended precision Add, Sub, Mul, Div
Port math/big pure go versions of add-with-carry, subtract-with-borrow,
full-width multiply, and full-width divide.
Updates #24813
Change-Id: Ifae5d2f6ee4237137c9dcba931f69c91b80a4b1c
---
M src/math/bits/bits.go
M src/math/bits/bits_test.go
2 files changed, 500 insertions(+), 0 deletions(-)
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
Love this and really really looking forward to seeing this merged!
Patch set 2:Code-Review +1
1 comment:
Patch Set #1, Line 838: var mulUintTests = []struct {
I just ported over the test cases from math/big. If the test cases involve additional cases, e.g. […]
Are you positive that the current
tests will provide good coverage even for future assembly optimizations of these functions on different platforms? My gut feeling is that some more stress testing can still be valuable. They don’t have to go in now though, they can be added as a followup CL.
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
Thanks for doing this. Some preliminary comments on the doc strings.
I haven't looked at the testing code yet.
8 comments:
Patch Set #2, Line 13: const
please move this into the multiply section - that's where it's used
nitpick: s/carry, sum =/carry: sum =/
A colon seems warranted here.
Apply everywhere.
Patch Set #2, Line 340: is assumed to
s/is assumed to/must/
Apply everywhere.
Patch Set #2, Line 378: undefined
s/ undefined/undefined/
(extra blank).
Fix everywhere.
Patch Set #2, Line 380: difference
s/difference/diff/ (?) (and fix the comment accordingly).
Below you also use quo, rem, not quotient and remainder. Using the shortcut seems ok to me. It's still quite clear what is meant.
Patch Set #2, Line 415: // Mul returns the full-width product of x and y the bits of the upper half
There are some grammar issues in this comment. Also, it doesn't follow the previously established pattern for Add and Sub. How about:
Mul returns the full-width product of x and y: (hi, lo) = x * y
with the product bits' upper half returned in hi and the lower
half returned in lo.
Apply everywhere.
Similarly, to match the existing pattern:
Div returns the quotient and remainder of (hi, lo) divided by y:
quo = (hi, lo)/y, rem = (hi, lo)%y with the dividend bits' upper
half in parameter hi and the lower half in parameter lo.
hi must be < y otherwise the behavior is undefined (the quotient
won't fit into quo).
Apply everywhere.
s/x/y/
(I agree with a prior comment on this).
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
Brian Kessler uploaded patch set #3 to this change.
math/bits: add extended precision Add, Sub, Mul, Div
Port math/big pure go versions of add-with-carry, subtract-with-borrow,
full-width multiply, and full-width divide.
Updates #24813
Change-Id: Ifae5d2f6ee4237137c9dcba931f69c91b80a4b1c
---
M src/math/bits/bits.go
M src/math/bits/bits_test.go
2 files changed, 509 insertions(+), 0 deletions(-)
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
Updated the doc strings per your comments.
7 comments:
please move this into the multiply section - that's where it's used
Done
nitpick: s/carry, sum =/carry: sum =/ […]
Done
s/is assumed to/must/ […]
Done
s/ undefined/undefined/ […]
Done
s/difference/diff/ (?) (and fix the comment accordingly). […]
Done
Similarly, to match the existing pattern: […]
Done
s/x/y/ […]
Done
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
LGTM
but I'd write the tests a bit differently (more compactly). Leaving up to you.
3 comments:
File src/math/bits/bits_test.go:
I'd make the order here match the function signature:
x, y, c, z, cout uint
but see below.
Patch Set #3, Line 748: func TestAddSubUint(t *testing.T) {
I would put everything into this function and dispense with the various external types. It also matches the existing tests more closely which are also self-contained, by and large.
func TestAddSubUint_(t *testing.T) {
test := func(msg string, f func(x, y, c uint) (z, cout uint), x, y, c, cout, z uint) {
z1, cout1 := f(x, y, c)
if z1 != z || cout1 != cout {
t.Errorf("%s: got z:cout = %#x:%#x; want %#x:%#x", msg, z1, cout1, z, cout)
}
} for _, a := range []struct{ x, y, c, cout, z uint }{
{0, 0, 0, 0, 0},
{0, 1, 0, 0, 1},
{0, 0, 1, 0, 1},
{0, 1, 1, 0, 2},
{12345, 67890, 0, 0, 80235},
{12345, 67890, 1, 0, 80236},
{_M, 1, 0, 1, 0},
{_M, 0, 1, 1, 0},
{_M, 1, 1, 1, 1},
{_M, _M, 0, 1, _M - 1},
{_M, _M, 1, 1, _M},
} {
test("Add", Add, a.x, a.y, a.c, a.cout, a.z)
test("Add symmetric", Add, a.y, a.x, a.c, a.cout, a.z)
test("Sub", Sub, a.z, a.x, a.c, a.cout, a.y)
test("Sub symmetric", Sub, a.z, a.y, a.c, a.cout, a.x)
}
}If you pass the arguments explicitly, then you don't need to pack and unpack them in the struct. See my example above.
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
A question about the undefined behavior when the quotient will not fit into a single word.
3 comments:
Patch Set #3, Line 455: // hi must be < y otherwise the behavior is undefined (the quotient
In the follow-up CL that implements the DIVQ intrinsic, a question came up about whether this behavior should be fully undefined or not. In particular, the pure go implementation handles 32-bit and 64-bit differently here. 64-bit division returns the signalling value (1<<64 - 1, 1<<64 - 1), while 32-bit division simply truncates via a uint64 to uint32 cast. The DIVQ 64-bit instruction on amd64 will cause a divide by zero exception for y < hi. I'm not sure whether all of those behaviors should be allowed or if this should be defined more narrowly, e.g. always panic or always return some value.
File src/math/bits/bits_test.go:
I'd make the order here match the function signature: […]
Done
Patch Set #3, Line 748: func TestAddSubUint(t *testing.T) {
I would put everything into this function and dispense with the various external types. […]
Yeah, that makes the tests much cleaner and more compact. I will update them.
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
Brian Kessler uploaded patch set #4 to this change.
math/bits: add extended precision Add, Sub, Mul, Div
Port math/big pure go versions of add-with-carry, subtract-with-borrow,
full-width multiply, and full-width divide.
Updates #24813
Change-Id: Ifae5d2f6ee4237137c9dcba931f69c91b80a4b1c
---
M src/math/bits/bits.go
M src/math/bits/bits_test.go
2 files changed, 460 insertions(+), 0 deletions(-)
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
Patch set 4:Run-TryBot +1Code-Review +2
1 comment:
Patch Set #3, Line 455: // hi must be < y otherwise the behavior is undefined (the quotient
In the follow-up CL that implements the DIVQ intrinsic, a question came up about whether this behavi […]
What you have here is ok as long as the intrinsified version behaves identical.
But you're probably right that even if this implementation and the intrisified version of this code have undefined behavior for some cases, it probably should be the same behavior if only to avoid strange errors (because people rely on these things even though they are not supposed to).
I'm ok with submitting this in a first step, though. We can harmonize the two implementations in a 2nd round.
To view, visit change 123157. To unsubscribe, or for help writing mail filters, visit settings.
TryBots are happy.
Patch set 4:TryBot-Result +1
Robert Griesemer merged this change.
math/bits: add extended precision Add, Sub, Mul, Div
Port math/big pure go versions of add-with-carry, subtract-with-borrow,
full-width multiply, and full-width divide.
Updates #24813
Change-Id: Ifae5d2f6ee4237137c9dcba931f69c91b80a4b1c
Reviewed-on: https://go-review.googlesource.com/123157
Reviewed-by: Robert Griesemer <g...@golang.org>
Run-TryBot: Robert Griesemer <g...@golang.org>
TryBot-Result: Gobot Gobot <go...@golang.org>
---
M src/math/bits/bits.go
M src/math/bits/bits_test.go
2 files changed, 460 insertions(+), 0 deletions(-)
diff --git a/src/math/bits/bits.go b/src/math/bits/bits.go
index 989baac..58cf52d 100644
--- a/src/math/bits/bits.go
+++ b/src/math/bits/bits.go
@@ -328,3 +328,197 @@
}
return n + int(len8tab[x])
}
+
+// --- Add with carry ---
+
// Add returns the sum with carry of x, y and carry: sum = x + y + carry.
// The carry input must be 0 or 1; otherwise the behavior is undefined.
// The carryOut output is guaranteed to be 0 or 1.
//
// The carry-out is derived with bit operations instead of a comparison,
// so the function is branch-free and its execution time does not depend
// on the inputs.
func Add(x, y, carry uint) (sum, carryOut uint) {
	sum = x + y + carry
	// The sum overflows iff both top bits are set (x & y), or at least
	// one top bit is set (x | y) and a carry out of the lower bits
	// cleared the top bit of the sum (&^ sum). 32<<(^uint(0)>>63)
	// evaluates to the width of uint (32 or 64) on this platform.
	carryOut = ((x & y) | ((x | y) &^ sum)) >> (32<<(^uint(0)>>63) - 1)
	return
}
+
// Add32 returns the sum with carry of x, y and carry: sum = x + y + carry.
// The carry input must be 0 or 1; otherwise the behavior is undefined.
// The carryOut output is guaranteed to be 0 or 1.
//
// The carry-out is derived with bit operations instead of a comparison,
// so the function is branch-free and its execution time does not depend
// on the inputs.
func Add32(x, y, carry uint32) (sum, carryOut uint32) {
	sum = x + y + carry
	// The sum overflows iff both top bits are set (x & y), or at least
	// one top bit is set (x | y) and a carry out of the lower bits
	// cleared the top bit of the sum (&^ sum).
	carryOut = ((x & y) | ((x | y) &^ sum)) >> 31
	return
}
+
// Add64 returns the sum with carry of x, y and carry: sum = x + y + carry.
// The carry input must be 0 or 1; otherwise the behavior is undefined.
// The carryOut output is guaranteed to be 0 or 1.
//
// The carry-out is derived with bit operations instead of a comparison,
// so the function is branch-free and its execution time does not depend
// on the inputs.
func Add64(x, y, carry uint64) (sum, carryOut uint64) {
	sum = x + y + carry
	// The sum overflows iff both top bits are set (x & y), or at least
	// one top bit is set (x | y) and a carry out of the lower bits
	// cleared the top bit of the sum (&^ sum).
	carryOut = ((x & y) | ((x | y) &^ sum)) >> 63
	return
}
+
+// --- Subtract with borrow ---
+
// Sub returns the difference of x, y and borrow: diff = x - y - borrow.
// The borrow input must be 0 or 1; otherwise the behavior is undefined.
// The borrowOut output is guaranteed to be 0 or 1.
//
// The borrow-out is derived with bit operations instead of a comparison,
// so the function is branch-free and its execution time does not depend
// on the inputs.
func Sub(x, y, borrow uint) (diff, borrowOut uint) {
	diff = x - y - borrow
	// A borrow occurs iff the top bit of y is set while the top bit of
	// x is not (^x & y), or the top bits of x and y agree (^(x ^ y))
	// and a borrow out of the lower bits set the top bit of diff.
	// 32<<(^uint(0)>>63) evaluates to the width of uint (32 or 64).
	borrowOut = ((^x & y) | (^(x ^ y) & diff)) >> (32<<(^uint(0)>>63) - 1)
	return
}
+
// Sub32 returns the difference of x, y and borrow: diff = x - y - borrow.
// The borrow input must be 0 or 1; otherwise the behavior is undefined.
// The borrowOut output is guaranteed to be 0 or 1.
//
// The borrow-out is derived with bit operations instead of a comparison,
// so the function is branch-free and its execution time does not depend
// on the inputs.
func Sub32(x, y, borrow uint32) (diff, borrowOut uint32) {
	diff = x - y - borrow
	// A borrow occurs iff the top bit of y is set while the top bit of
	// x is not (^x & y), or the top bits of x and y agree (^(x ^ y))
	// and a borrow out of the lower bits set the top bit of diff.
	borrowOut = ((^x & y) | (^(x ^ y) & diff)) >> 31
	return
}
+
// Sub64 returns the difference of x, y and borrow: diff = x - y - borrow.
// The borrow input must be 0 or 1; otherwise the behavior is undefined.
// The borrowOut output is guaranteed to be 0 or 1.
//
// The borrow-out is derived with bit operations instead of a comparison,
// so the function is branch-free and its execution time does not depend
// on the inputs.
func Sub64(x, y, borrow uint64) (diff, borrowOut uint64) {
	diff = x - y - borrow
	// A borrow occurs iff the top bit of y is set while the top bit of
	// x is not (^x & y), or the top bits of x and y agree (^(x ^ y))
	// and a borrow out of the lower bits set the top bit of diff.
	borrowOut = ((^x & y) | (^(x ^ y) & diff)) >> 63
	return
}
+
+// --- Full-width multiply ---
+
+// Mul returns the full-width product of x and y: (hi, lo) = x * y
+// with the product bits' upper half returned in hi and the lower
+// half returned in lo.
+func Mul(x, y uint) (hi, lo uint) {
+ if UintSize == 32 {
+ h, l := Mul32(uint32(x), uint32(y))
+ return uint(h), uint(l)
+ }
+ h, l := Mul64(uint64(x), uint64(y))
+ return uint(h), uint(l)
+}
+
// Mul32 returns the 64-bit product of x and y: (hi, lo) = x * y
// with the product bits' upper half returned in hi and the lower
// half returned in lo.
func Mul32(x, y uint32) (hi, lo uint32) {
	// A single 64-bit multiplication holds the whole product;
	// split it into its two 32-bit halves.
	prod := uint64(x) * uint64(y)
	return uint32(prod >> 32), uint32(prod)
}
+
// Mul64 returns the 128-bit product of x and y: (hi, lo) = x * y
// with the product bits' upper half returned in hi and the lower
// half returned in lo.
func Mul64(x, y uint64) (hi, lo uint64) {
	const mask32 = 1<<32 - 1
	// Split each operand into 32-bit halves and combine the four
	// partial products. Only hi needs the schoolbook bookkeeping;
	// the native 64-bit multiply already yields the low half.
	xlo, xhi := x&mask32, x>>32
	ylo, yhi := y&mask32, y>>32
	mid := xhi*ylo + (xlo*ylo)>>32
	carry := mid>>32 + (mid&mask32+xlo*yhi)>>32
	hi = xhi*yhi + carry
	lo = x * y
	return
}
+
+// --- Full-width divide ---
+
+// Div returns the quotient and remainder of (hi, lo) divided by y:
+// quo = (hi, lo)/y, rem = (hi, lo)%y with the dividend bits' upper
+// half in parameter hi and the lower half in parameter lo.
+// hi must be < y otherwise the behavior is undefined (the quotient
+// won't fit into quo).
+func Div(hi, lo, y uint) (quo, rem uint) {
+ if UintSize == 32 {
+ q, r := Div32(uint32(hi), uint32(lo), uint32(y))
+ return uint(q), uint(r)
+ }
+ q, r := Div64(uint64(hi), uint64(lo), uint64(y))
+ return uint(q), uint(r)
+}
+
// Div32 returns the quotient and remainder of (hi, lo) divided by y:
// quo = (hi, lo)/y, rem = (hi, lo)%y with the dividend bits' upper
// half in parameter hi and the lower half in parameter lo.
// hi must be < y otherwise the behavior is undefined (the quotient
// won't fit into quo).
func Div32(hi, lo, y uint32) (quo, rem uint32) {
	// The 64-bit dividend fits a native integer division.
	dividend := uint64(hi)<<32 | uint64(lo)
	divisor := uint64(y)
	return uint32(dividend / divisor), uint32(dividend % divisor)
}
+
// Div64 returns the quotient and remainder of (hi, lo) divided by y:
// quo = (hi, lo)/y, rem = (hi, lo)%y with the dividend bits' upper
// half in parameter hi and the lower half in parameter lo.
// hi must be < y otherwise the behavior is undefined (the quotient
// won't fit into quo).
//
// The implementation is schoolbook long division in base 2^32 with a
// two-digit quotient (Knuth, TAOCP Vol. 2, Algorithm D; see also the
// divlu routine in Hacker's Delight): normalize the divisor, estimate
// and correct each 32-bit quotient digit, then de-normalize the
// remainder.
func Div64(hi, lo, y uint64) (quo, rem uint64) {
	const (
		two32  = 1 << 32
		mask32 = two32 - 1
	)
	// The quotient cannot fit in 64 bits (this also catches y == 0).
	// Behavior is documented as undefined here; all-ones is returned
	// as an arbitrary sentinel value.
	if hi >= y {
		return 1<<64 - 1, 1<<64 - 1
	}

	// Normalize: shift y left until its top bit is set. The dividend
	// is shifted by the same amount below, which leaves the quotient
	// unchanged and scales the remainder by 2^s (undone at the end).
	s := uint(LeadingZeros64(y))
	y <<= s

	yn1 := y >> 32       // high digit of the normalized divisor
	yn0 := y & mask32    // low digit of the normalized divisor
	// High 64 bits of the shifted dividend. When s == 0, lo>>(64-s)
	// is lo>>64, which Go defines to be 0, so un32 == hi exactly.
	un32 := hi<<s | lo>>(64-s)
	un10 := lo << s      // low 64 bits of the shifted dividend
	un1 := un10 >> 32    // dividend digit 1
	un0 := un10 & mask32 // dividend digit 0
	// Estimate the high quotient digit and its partial remainder.
	q1 := un32 / yn1
	rhat := un32 - q1*yn1

	// Correct the estimate downward. With a normalized divisor the
	// estimate exceeds the true digit by at most 2 (per Knuth), so
	// this loop body runs at most twice.
	for q1 >= two32 || q1*yn0 > two32*rhat+un1 {
		q1--
		rhat += yn1
		if rhat >= two32 {
			break
		}
	}

	// Remainder after removing q1*y, combined with the next digit;
	// then estimate and correct the low quotient digit the same way.
	un21 := un32*two32 + un1 - q1*y
	q0 := un21 / yn1
	rhat = un21 - q0*yn1

	for q0 >= two32 || q0*yn0 > two32*rhat+un0 {
		q0--
		rhat += yn1
		if rhat >= two32 {
			break
		}
	}

	// Assemble the quotient digits and shift the remainder back down.
	return q1*two32 + q0, (un21*two32 + un0 - q0*y) >> s
}
diff --git a/src/math/bits/bits_test.go b/src/math/bits/bits_test.go
index 5c34f6d..bd6b618 100644
--- a/src/math/bits/bits_test.go
+++ b/src/math/bits/bits_test.go
@@ -705,6 +705,272 @@
}
}
// Shorthand all-bits-set masks for the three operand widths exercised
// by the tests below.
const (
	_M   = 1<<UintSize - 1 // uint mask (platform word size)
	_M32 = 1<<32 - 1       // uint32 mask
	_M64 = 1<<64 - 1       // uint64 mask
)
+
// TestAddSubUint checks Add and Sub on uint against a table of cases.
// Each row encodes the identity x + y + c = (cout, z); the same row is
// replayed through Sub as z - x - c = (cout, y), since subtraction is
// the exact inverse of the addition and the carry-out doubles as the
// borrow-out of the inverse operation.
func TestAddSubUint(t *testing.T) {
	// test runs f and compares the (result, carry/borrow-out) pair
	// against the expected values.
	test := func(msg string, f func(x, y, c uint) (z, cout uint), x, y, c, z, cout uint) {
		z1, cout1 := f(x, y, c)
		if z1 != z || cout1 != cout {
			t.Errorf("%s: got z:cout = %#x:%#x; want %#x:%#x", msg, z1, cout1, z, cout)
		}
	}
	for _, a := range []struct{ x, y, c, z, cout uint }{
		{0, 0, 0, 0, 0},
		{0, 1, 0, 1, 0},
		{0, 0, 1, 1, 0},
		{0, 1, 1, 2, 0},
		{12345, 67890, 0, 80235, 0},
		{12345, 67890, 1, 80236, 0},
		{_M, 1, 0, 0, 1},       // wraparound
		{_M, 0, 1, 0, 1},       // carry-in alone wraps
		{_M, 1, 1, 1, 1},
		{_M, _M, 0, _M - 1, 1},
		{_M, _M, 1, _M, 1},     // maximal possible sum
	} {
		test("Add", Add, a.x, a.y, a.c, a.z, a.cout)
		test("Add symmetric", Add, a.y, a.x, a.c, a.z, a.cout)
		test("Sub", Sub, a.z, a.x, a.c, a.y, a.cout)
		test("Sub symmetric", Sub, a.z, a.y, a.c, a.x, a.cout)
	}
}
+
// TestAddSubUint32 mirrors TestAddSubUint at fixed 32-bit width:
// each row encodes x + y + c = (cout, z) and is replayed through
// Sub32 as z - x - c = (cout, y).
func TestAddSubUint32(t *testing.T) {
	// test runs f and compares the (result, carry/borrow-out) pair
	// against the expected values.
	test := func(msg string, f func(x, y, c uint32) (z, cout uint32), x, y, c, z, cout uint32) {
		z1, cout1 := f(x, y, c)
		if z1 != z || cout1 != cout {
			t.Errorf("%s: got z:cout = %#x:%#x; want %#x:%#x", msg, z1, cout1, z, cout)
		}
	}
	for _, a := range []struct{ x, y, c, z, cout uint32 }{
		{0, 0, 0, 0, 0},
		{0, 1, 0, 1, 0},
		{0, 0, 1, 1, 0},
		{0, 1, 1, 2, 0},
		{12345, 67890, 0, 80235, 0},
		{12345, 67890, 1, 80236, 0},
		{_M32, 1, 0, 0, 1},         // wraparound
		{_M32, 0, 1, 0, 1},         // carry-in alone wraps
		{_M32, 1, 1, 1, 1},
		{_M32, _M32, 0, _M32 - 1, 1},
		{_M32, _M32, 1, _M32, 1},   // maximal possible sum
	} {
		test("Add32", Add32, a.x, a.y, a.c, a.z, a.cout)
		test("Add32 symmetric", Add32, a.y, a.x, a.c, a.z, a.cout)
		test("Sub32", Sub32, a.z, a.x, a.c, a.y, a.cout)
		test("Sub32 symmetric", Sub32, a.z, a.y, a.c, a.x, a.cout)
	}
}
+
// TestAddSubUint64 mirrors TestAddSubUint at fixed 64-bit width:
// each row encodes x + y + c = (cout, z) and is replayed through
// Sub64 as z - x - c = (cout, y).
func TestAddSubUint64(t *testing.T) {
	// test runs f and compares the (result, carry/borrow-out) pair
	// against the expected values.
	test := func(msg string, f func(x, y, c uint64) (z, cout uint64), x, y, c, z, cout uint64) {
		z1, cout1 := f(x, y, c)
		if z1 != z || cout1 != cout {
			t.Errorf("%s: got z:cout = %#x:%#x; want %#x:%#x", msg, z1, cout1, z, cout)
		}
	}
	for _, a := range []struct{ x, y, c, z, cout uint64 }{
		{0, 0, 0, 0, 0},
		{0, 1, 0, 1, 0},
		{0, 0, 1, 1, 0},
		{0, 1, 1, 2, 0},
		{12345, 67890, 0, 80235, 0},
		{12345, 67890, 1, 80236, 0},
		{_M64, 1, 0, 0, 1},         // wraparound
		{_M64, 0, 1, 0, 1},         // carry-in alone wraps
		{_M64, 1, 1, 1, 1},
		{_M64, _M64, 0, _M64 - 1, 1},
		{_M64, _M64, 1, _M64, 1},   // maximal possible sum
	} {
		test("Add64", Add64, a.x, a.y, a.c, a.z, a.cout)
		test("Add64 symmetric", Add64, a.y, a.x, a.c, a.z, a.cout)
		test("Sub64", Sub64, a.z, a.x, a.c, a.y, a.cout)
		test("Sub64 symmetric", Sub64, a.z, a.y, a.c, a.x, a.cout)
	}
}
+
// TestMulDiv checks Mul and Div on uint. Each row encodes the
// full-width product x * y = (hi, lo) together with a remainder r,
// so Div(hi, lo+r, y) must recover (x, r). The symmetric Div case
// swaps the roles of x and y; rows are chosen so that r is smaller
// than both divisors and lo+r does not overflow.
func TestMulDiv(t *testing.T) {
	// testMul compares f's full-width product against (hi, lo).
	testMul := func(msg string, f func(x, y uint) (hi, lo uint), x, y, hi, lo uint) {
		hi1, lo1 := f(x, y)
		if hi1 != hi || lo1 != lo {
			t.Errorf("%s: got hi:lo = %#x:%#x; want %#x:%#x", msg, hi1, lo1, hi, lo)
		}
	}
	// testDiv compares f's quotient and remainder against (q, r).
	testDiv := func(msg string, f func(hi, lo, y uint) (q, r uint), hi, lo, y, q, r uint) {
		q1, r1 := f(hi, lo, y)
		if q1 != q || r1 != r {
			t.Errorf("%s: got q:r = %#x:%#x; want %#x:%#x", msg, q1, r1, q, r)
		}
	}
	for _, a := range []struct {
		x, y      uint
		hi, lo, r uint
	}{
		{1 << (UintSize - 1), 2, 1, 0, 1},
		{_M, _M, _M - 1, 1, 42},
	} {
		testMul("Mul", Mul, a.x, a.y, a.hi, a.lo)
		testMul("Mul symmetric", Mul, a.y, a.x, a.hi, a.lo)
		testDiv("Div", Div, a.hi, a.lo+a.r, a.y, a.x, a.r)
		testDiv("Div symmetric", Div, a.hi, a.lo+a.r, a.x, a.y, a.r)
	}
}
+
// TestMulDiv32 mirrors TestMulDiv at fixed 32-bit width: each row
// encodes x * y = (hi, lo) plus a remainder r, so Div32(hi, lo+r, y)
// must recover (x, r), and symmetrically with x and y swapped.
func TestMulDiv32(t *testing.T) {
	// testMul compares f's full-width product against (hi, lo).
	testMul := func(msg string, f func(x, y uint32) (hi, lo uint32), x, y, hi, lo uint32) {
		hi1, lo1 := f(x, y)
		if hi1 != hi || lo1 != lo {
			t.Errorf("%s: got hi:lo = %#x:%#x; want %#x:%#x", msg, hi1, lo1, hi, lo)
		}
	}
	// testDiv compares f's quotient and remainder against (q, r).
	testDiv := func(msg string, f func(hi, lo, y uint32) (q, r uint32), hi, lo, y, q, r uint32) {
		q1, r1 := f(hi, lo, y)
		if q1 != q || r1 != r {
			t.Errorf("%s: got q:r = %#x:%#x; want %#x:%#x", msg, q1, r1, q, r)
		}
	}
	for _, a := range []struct {
		x, y      uint32
		hi, lo, r uint32
	}{
		{1 << 31, 2, 1, 0, 1},
		{0xc47dfa8c, 50911, 0x98a4, 0x998587f4, 13},
		{_M32, _M32, _M32 - 1, 1, 42},
	} {
		testMul("Mul32", Mul32, a.x, a.y, a.hi, a.lo)
		testMul("Mul32 symmetric", Mul32, a.y, a.x, a.hi, a.lo)
		testDiv("Div32", Div32, a.hi, a.lo+a.r, a.y, a.x, a.r)
		testDiv("Div32 symmetric", Div32, a.hi, a.lo+a.r, a.x, a.y, a.r)
	}
}
+
+func TestMulDiv64(t *testing.T) {
+ testMul := func(msg string, f func(x, y uint64) (hi, lo uint64), x, y, hi, lo uint64) {
+ hi1, lo1 := f(x, y)
+ if hi1 != hi || lo1 != lo {
+ t.Errorf("%s: got hi:lo = %#x:%#x; want %#x:%#x", msg, hi1, lo1, hi, lo)
+ }
+ }
+ testDiv := func(msg string, f func(hi, lo, y uint64) (q, r uint64), hi, lo, y, q, r uint64) {
+ q1, r1 := f(hi, lo, y)
+ if q1 != q || r1 != r {
+ t.Errorf("%s: got q:r = %#x:%#x; want %#x:%#x", msg, q1, r1, q, r)
+ }
+ }
+ for _, a := range []struct {
+ x, y uint64
+ hi, lo, r uint64
+ }{
+ {1 << 63, 2, 1, 0, 1},
+ {0x3626229738a3b9, 0xd8988a9f1cc4a61, 0x2dd0712657fe8, 0x9dd6a3364c358319, 13},
+ {_M64, _M64, _M64 - 1, 1, 42},
+ } {
+ testMul("Mul64", Mul64, a.x, a.y, a.hi, a.lo)
+ testMul("Mul64 symmetric", Mul64, a.y, a.x, a.hi, a.lo)
+ testDiv("Div64", Div64, a.hi, a.lo+a.r, a.y, a.x, a.r)
+ testDiv("Div64 symmetric", Div64, a.hi, a.lo+a.r, a.x, a.y, a.r)
RELNOTE=yes