[go] strconv: optimize and simplify dragonbox implementation

4 views

Skip to first unread message

Kevin Burke (Gerrit)

unread,

Dec 20, 2025, 2:25:17 AM (2 days ago) Dec 20

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Kevin Burke has uploaded the change for review

Commit message

strconv: optimize and simplify dragonbox implementation

Replace umul192Upper128 and umul192Lower128 helper functions with
the canonical umul192 function from math.go. This simplifies both
dboxMulPow64 and dboxParity64 and eliminates duplicate multiplication
logic. The specialized helpers and two additional unused functions
(uadd128, umul128Upper64) can be removed.

Eliminate duplicate mantissa bit constants floatMantBits64 and
floatMantBits32 in favor of the existing float64MantBits and
float32MantBits constants. Optimize hot path computations in
dboxFtoa64 and dboxFtoa32 by hoisting the mant2 calculation. Instead
of computing mant*2 three times per call, compute it once and reuse
it.

Eliminate an unnecessary dboxFtoa function call since bitSize is
already known at the call site. Add additional test coverage for
umul192.

Benchmarks performed on a quiet MacBook Pro with M1:

name                               old time/op    new time/op    delta
FormatFloat/Decimal-10               29.8ns ± 1%    26.8ns ± 0%  -10.09%  (p=0.000 n=9+8)
FormatFloat/Float-10                 41.6ns ± 0%    37.0ns ± 1%  -10.95%  (p=0.000 n=9+9)
FormatFloat/Exp-10                   34.0ns ± 2%    30.3ns ± 2%  -10.95%  (p=0.000 n=10+9)
FormatFloat/NegExp-10                33.5ns ± 1%    30.0ns ± 1%  -10.57%  (p=0.000 n=10+9)
FormatFloat/LongExp-10               39.9ns ± 0%    36.1ns ± 1%   -9.35%  (p=0.000 n=8+9)
FormatFloat/Big-10                   41.3ns ± 0%    34.6ns ± 0%  -16.27%  (p=0.000 n=8+9)
FormatFloat/BinaryExp-10             28.4ns ± 0%    26.2ns ± 1%   -7.60%  (p=0.000 n=8+10)
FormatFloat/32Integer-10             28.0ns ± 5%    26.7ns ± 0%   -4.85%  (p=0.000 n=10+10)
FormatFloat/32ExactFraction-10       33.2ns ± 0%    34.7ns ± 4%   +4.64%  (p=0.000 n=8+10)
FormatFloat/32Point-10               36.4ns ± 1%    38.7ns ± 0%   +6.18%  (p=0.000 n=9+8)
FormatFloat/32Exp-10                 29.8ns ± 0%    31.5ns ± 6%   +5.61%  (p=0.022 n=10+10)
FormatFloat/32NegExp-10              29.7ns ± 0%    29.5ns ± 1%   -0.77%  (p=0.001 n=9+10)
FormatFloat/32Shortest-10            31.8ns ± 1%    31.8ns ± 3%     ~     (p=0.118 n=9+10)
FormatFloat/Slowpath64-10            40.6ns ± 0%    36.4ns ± 1%  -10.36%  (p=0.000 n=10+10)
FormatFloat/SlowpathDenormal64-10    38.9ns ± 2%    35.0ns ± 0%  -10.19%  (p=0.000 n=10+8)

Change-Id: I649353f7d9cadeeabb33a7edce912ef2fa867075

Change diff

diff --git a/src/internal/strconv/ftoa.go b/src/internal/strconv/ftoa.go
index c8c98c1..027bc2c 100644
--- a/src/internal/strconv/ftoa.go
+++ b/src/internal/strconv/ftoa.go
@@ -135,7 +135,11 @@
 		// Use the Dragonbox algorithm.
 		var buf [32]byte
 		digs.d = buf[:]
-		dboxFtoa(&digs, mant, exp-int(flt.mantbits), denorm, bitSize)
+		if bitSize == 32 {
+			dboxFtoa32(&digs, uint32(mant), exp-int(flt.mantbits), denorm)
+		} else {
+			dboxFtoa64(&digs, mant, exp-int(flt.mantbits), denorm)
+		}
 		// Precision for shortest representation mode.
 		switch fmt {
 		case 'e', 'E':
diff --git a/src/internal/strconv/ftoadbox.go b/src/internal/strconv/ftoadbox.go
index bdd9cd1..018984f 100644
--- a/src/internal/strconv/ftoadbox.go
+++ b/src/internal/strconv/ftoadbox.go
@@ -19,17 +19,9 @@
 // The reference implementation in C++ by Junekey Jeon can be found at:
 // https://github.com/jk-jeon/dragonbox/blob/6c7c925b571d54486b9ffae8d9d18a822801cbda/subproject/simple/include/simple_dragonbox.h
 
-// dragonboxFtoa computes the decimal significand and exponent
-// from the binary significand and exponent using the Dragonbox algorithm
+// dboxFtoa64 computes the decimal significand and exponent
+// from the binary significand and exponent using the Dragonbox algorithm,
 // and formats the decimal floating point number in d.
-func dboxFtoa(d *decimalSlice, mant uint64, exp int, denorm bool, bitSize int) {
-	if bitSize == 32 {
-		dboxFtoa32(d, uint32(mant), exp, denorm)
-		return
-	}
-	dboxFtoa64(d, mant, exp, denorm)
-}
-
 func dboxFtoa64(d *decimalSlice, mant uint64, exp int, denorm bool) {
 	if mant == 1<<float64MantBits && !denorm {
 		// Algorithm 5.6 (page 24).
@@ -65,7 +57,8 @@
 	// Algorithm 5.2 (page 15).
 	k0 := -mulLog10_2(exp)
 	φ, β := dboxPow64(κ+k0, exp)
-	zi, exact := dboxMulPow64(uint64(mant*2+1)<<β, φ)
+	mant2 := mant << 1
+	zi, exact := dboxMulPow64(uint64(mant2+1)<<β, φ)
 	s, r := zi/p10κ1, uint32(zi%p10κ1)
 	δi := dboxDelta64(φ, β)
 
@@ -78,7 +71,7 @@
 		s--
 		r = p10κ * 10
 	} else if r == δi {
-		parity, exact := dboxParity64(uint64(mant*2-1), φ, β)
+		parity, exact := dboxParity64(uint64(mant2-1), φ, β)
 		if parity || (exact && mant%2 == 0) {
 			s, zeros := trimZeros(s)
 			dboxDigits(d, s, -k0+1+zeros)
@@ -91,7 +84,7 @@
 	t, ρ := D/p10κ, D%p10κ
 	yru := 10*s + uint64(t)
 	if ρ == 0 {
-		parity, exact := dboxParity64(mant*2, φ, β)
+		parity, exact := dboxParity64(mant2, φ, β)
 		if parity != ((D-p10κ/2)%2 != 0) || exact && yru%2 != 0 {
 			yru--
 		}
@@ -99,7 +92,7 @@
 	dboxDigits(d, yru, -k0)
 }
 
-// Almost identical to dragonboxFtoa64.
+// Almost identical to dboxFtoa64.
 // This is kept as a separate copy to minimize runtime overhead.
 func dboxFtoa32(d *decimalSlice, mant uint32, exp int, denorm bool) {
 	if mant == 1<<float32MantBits && !denorm {
@@ -136,7 +129,8 @@
 	// Algorithm 5.2 (page 15).
 	k0 := -mulLog10_2(exp)
 	φ, β := dboxPow32(κ+k0, exp)
-	zi, exact := dboxMulPow32(uint32(mant*2+1)<<β, φ)
+	mant2 := mant << 1
+	zi, exact := dboxMulPow32(uint32(mant2+1)<<β, φ)
 	s, r := zi/p10κ1, uint32(zi%p10κ1)
 	δi := dboxDelta32(φ, β)
 
@@ -149,7 +143,7 @@
 		s--
 		r = p10κ * 10
 	} else if r == δi {
-		parity, exact := dboxParity32(uint32(mant*2-1), φ, β)
+		parity, exact := dboxParity32(uint32(mant2-1), φ, β)
 		if parity || (exact && mant%2 == 0) {
 			s, zeros := trimZeros(uint64(s))
 			dboxDigits(d, s, -k0+1+zeros)
@@ -162,7 +156,7 @@
 	t, ρ := D/p10κ, D%p10κ
 	yru := 10*s + uint32(t)
 	if ρ == 0 {
-		parity, exact := dboxParity32(mant*2, φ, β)
+		parity, exact := dboxParity32(mant2, φ, β)
 		if parity != ((D-p10κ/2)%2 != 0) || exact && yru%2 != 0 {
 			yru--
 		}
@@ -179,17 +173,6 @@
 	d.dp = d.nd + exp
 }
 
-// uadd128 returns the full 128 bits of u + n.
-func uadd128(u uint128, n uint64) uint128 {
-	sum := uint64(u.Lo + n)
-	// Check if lo is wrapped around.
-	if sum < u.Lo {
-		u.Hi++
-	}
-	u.Lo = sum
-	return u
-}
-
 // umul64 returns the full 64 bits of x * y.
 func umul64(x, y uint32) uint64 {
 	return uint64(x) * uint64(y)
@@ -211,45 +194,12 @@
 	return uint64(uint64(x) * y)
 }
 
-// umul128Upper64 returns the upper 64 bits (out of 128 bits) of x * y.
-func umul128Upper64(x, y uint64) uint64 {
-	a := uint32(x >> 32)
-	b := uint32(x)
-	c := uint32(y >> 32)
-	d := uint32(y)
-
-	ac := umul64(a, c)
-	bc := umul64(b, c)
-	ad := umul64(a, d)
-	bd := umul64(b, d)
-
-	intermediate := (bd >> 32) + uint64(uint32(ad)) + uint64(uint32(bc))
-
-	return ac + (intermediate >> 32) + (ad >> 32) + (bc >> 32)
-}
-
-// umul192Upper128 returns the upper 128 bits (out of 192 bits) of x * y.
-func umul192Upper128(x uint64, y uint128) uint128 {
-	r := umul128(x, y.Hi)
-	t := umul128Upper64(x, y.Lo)
-	return uadd128(r, t)
-}
-
-// umul192Lower128 returns the lower 128 bits (out of 192 bits) of x * y.
-func umul192Lower128(x uint64, y uint128) uint128 {
-	high := x * y.Hi
-	highLow := umul128(x, y.Lo)
-	return uint128{uint64(high + highLow.Hi), highLow.Lo}
-}
-
 // dboxMulPow64 computes x^(i), y^(i), z^(i)
 // from the precomputed value of φ̃k for float64
 // and also checks if x^(f), y^(f), z^(f) == 0 (section 5.2.1).
 func dboxMulPow64(u uint64, phi uint128) (intPart uint64, isInt bool) {
-	r := umul192Upper128(u, phi)
-	intPart = r.Hi
-	isInt = r.Lo == 0
-	return
+	hi, mid, _ := umul192(u, phi)
+	return hi, mid == 0
 }
 
 // dboxMulPow32 computes x^(i), y^(i), z^(i)
@@ -266,9 +216,9 @@
 // from the precomputed value of φ̃k for float64
 // and also checks if x^(f), y^(f), z^(f) = 0 (section 5.2.1).
 func dboxParity64(mant2 uint64, phi uint128, beta int) (parity bool, isInt bool) {
-	r := umul192Lower128(mant2, phi)
-	parity = ((r.Hi >> (64 - beta)) & 1) != 0
-	isInt = ((uint64(r.Hi << beta)) | (r.Lo >> (64 - beta))) == 0
+	_, mid, lo := umul192(mant2, phi)
+	parity = ((mid >> (64 - beta)) & 1) != 0
+	isInt = ((mid << beta) | (lo >> (64 - beta))) == 0
 	return
 }
 
@@ -299,11 +249,6 @@
 	return (e*631305 - 261663) >> 21
 }
 
-const (
-	floatMantBits64 = 52 // p = 52 for float64.
-	floatMantBits32 = 23 // p = 23 for float32.
-)
-
 // dboxRange64 returns the left and right float64 endpoints.
 func dboxRange64(φ uint128, β int) (left, right uint64) {
 	left = (φ.Hi - (φ.Hi >> (float64MantBits + 2))) >> (64 - float64MantBits - 1 - β)
@@ -313,19 +258,19 @@
 
 // dboxRange32 returns the left and right float32 endpoints.
 func dboxRange32(φ uint64, β int) (left, right uint32) {
-	left = uint32((φ - (φ >> (floatMantBits32 + 2))) >> (64 - floatMantBits32 - 1 - β))
-	right = uint32((φ + (φ >> (floatMantBits32 + 1))) >> (64 - floatMantBits32 - 1 - β))
+	left = uint32((φ - (φ >> (float32MantBits + 2))) >> (64 - float32MantBits - 1 - β))
+	right = uint32((φ + (φ >> (float32MantBits + 1))) >> (64 - float32MantBits - 1 - β))
 	return left, right
 }
 
 // dboxRoundUp64 computes the round up of y (i.e., y^(ru)).
 func dboxRoundUp64(phi uint128, beta int) uint64 {
-	return (phi.Hi>>(128/2-floatMantBits64-2-beta) + 1) / 2
+	return (phi.Hi>>(128/2-float64MantBits-2-beta) + 1) / 2
 }
 
 // dboxRoundUp32 computes the round up of y (i.e., y^(ru)).
 func dboxRoundUp32(phi uint64, beta int) uint32 {
-	return uint32(phi>>(64-floatMantBits32-2-beta)+1) / 2
+	return uint32(phi>>(64-float32MantBits-2-beta)+1) / 2
 }
 
 // dboxPow64 gets the precomputed value of φ̃̃k for float64.
diff --git a/src/internal/strconv/math_test.go b/src/internal/strconv/math_test.go
index 55e25f9..995ab95 100644
--- a/src/internal/strconv/math_test.go
+++ b/src/internal/strconv/math_test.go
@@ -62,6 +62,17 @@
 }{
 	{0, u128(0, 0), 0, 0, 0},
 	{^uint64(0), u128(^uint64(0), ^uint64(0)), ^uint64(1), ^uint64(0), 1},
+
+	// Type 1: Cases where mid == 0 (tests isInt logic in dboxMulPow64)
+	{1, u128(0, 1), 0, 0, 1},                     // 1 * {0, 1} = {0, 0, 1}
+	{1 << 32, u128(1<<32, 0), 1, 0, 0},           // 2^32 * {2^32, 0} = {1, 0, 0}
+	{0x100, u128(0x100000000000000, 0), 1, 0, 0}, // 256 * 2^56 = 2^64 = {1, 0, 0}
+
+	// Type 2: Powers of 2 and interesting bit patterns
+	{1 << 32, u128(0, 1<<32), 0, 1, 0},                         // 2^32 * {0, 2^32} = {0, 1, 0}
+	{0x123456789ABCDEF0, u128(1, 0), 0, 0x123456789ABCDEF0, 0}, // Large value * {1, 0}
+	{0x123456789ABCDEF0, u128(0, 2), 0, 0, 0x2468ACF13579BDE0}, // Large value * {0, 2}
+	{1 << 63, u128(2, 0), 1, 0, 0},                             // 2^63 * {2, 0} = {1, 0, 0}
 }
 
 func TestUmul192(t *testing.T) {

Change information

Files:

M src/internal/strconv/ftoa.go
M src/internal/strconv/ftoadbox.go
M src/internal/strconv/math_test.go

Change size: M

Delta: 3 files changed, 36 insertions(+), 76 deletions(-)

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Kevin Burke (Gerrit)

unread,

Dec 20, 2025, 2:52:59 AM (2 days ago) Dec 20

to goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Kevin Burke voted Commit-Queue+1

Commit-Queue