[go] runtime: using wyhash for memhashFallback on 64bit platform

Meng Zhuo (Gerrit)

Apr 11, 2021, 10:29:39 PM
to Meng Zhuo, goph...@pubsubhelper.golang.org, golang-...@googlegroups.com, Keith Randall, Go Bot, eric fang, Russ Cox, Cuong Manh Le, golang-co...@googlegroups.com

Meng Zhuo submitted this change.

View Change

Approvals:
  Keith Randall: Looks good to me, approved
  Meng Zhuo: Trusted; Run TryBots
  Go Bot: TryBots succeeded
runtime: using wyhash for memhashFallback on 64bit platform

wyhash is a general hash function that:

1. Is about 8-70% faster than the internal maphash
2. Passes the SMHasher, BigCrush and PractRand test suites

name old time/op new time/op delta
Hash5 28.9ns ± 0% 30.0ns ± 0% +3.77% (p=0.000 n=9+10)
Hash16 32.4ns ± 0% 30.2ns ± 0% -6.74% (p=0.000 n=10+8)
Hash64 52.4ns ± 0% 43.4ns ± 0% -17.20% (p=0.000 n=9+10)
Hash1024 415ns ± 0% 258ns ± 2% -37.89% (p=0.000 n=10+10)
Hash65536 24.9µs ± 0% 14.6µs ± 0% -41.22% (p=0.000 n=9+9)
HashStringSpeed 50.2ns ± 4% 47.8ns ± 4% -4.88% (p=0.000 n=10+10)
HashBytesSpeed 90.1ns ± 7% 78.3ns ± 4% -13.06% (p=0.000 n=10+10)
HashInt32Speed 33.3ns ± 6% 33.6ns ± 4% ~ (p=0.071 n=10+10)
HashInt64Speed 32.7ns ± 3% 34.0ns ± 3% +4.05% (p=0.000 n=9+10)
HashStringArraySpeed 131ns ± 2% 117ns ± 5% -10.32% (p=0.000 n=9+10)
FastrandHashiter 72.2ns ± 1% 75.7ns ±10% +4.87% (p=0.019 n=8+10)

name old speed new speed delta
Hash5 173MB/s ± 0% 167MB/s ± 0% -3.63% (p=0.000 n=9+10)
Hash16 494MB/s ± 0% 530MB/s ± 0% +7.23% (p=0.000 n=10+8)
Hash64 1.22GB/s ± 0% 1.48GB/s ± 0% +20.77% (p=0.000 n=9+10)
Hash1024 2.47GB/s ± 0% 3.97GB/s ± 2% +61.01% (p=0.000 n=8+10)
Hash65536 2.64GB/s ± 0% 4.48GB/s ± 0% +70.13% (p=0.000 n=9+9)
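
The deltas above come from the runtime package's internal Hash* and Hash*Speed benchmarks, compared before and after the change with benchstat (golang.org/x/perf/cmd/benchstat). As a rough, stand-alone approximation you can also measure string-hashing throughput through the public hash/maphash package, which is serviced by the runtime hash; on CPUs without hardware AES support that path ends in memhashFallback. The sketch below is illustrative only and not part of this CL; its package and benchmark names are arbitrary.

// Illustrative benchmark sketch (not from this CL). hash/maphash is backed
// by the runtime hash, so on machines without AES hashing this exercises
// the memhashFallback path changed here.
package hashbench

import (
    "hash/maphash"
    "strings"
    "testing"
)

var seed = maphash.MakeSeed()

func benchString(b *testing.B, size int) {
    s := strings.Repeat("a", size)
    b.SetBytes(int64(size))
    for i := 0; i < b.N; i++ {
        var h maphash.Hash
        h.SetSeed(seed)
        h.WriteString(s)
        _ = h.Sum64()
    }
}

func BenchmarkString64(b *testing.B)   { benchString(b, 64) }
func BenchmarkString1024(b *testing.B) { benchString(b, 1024) }

Running it once with an old toolchain and once with the new one (go test -bench=String -count=10, redirected to old.txt and new.txt) and feeding both files to benchstat yields delta tables of the same shape as the ones above.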

Change-Id: I76af4e2bc1995a18149d11983ea8a149c132865e
Reviewed-on: https://go-review.googlesource.com/c/go/+/279612
Trust: Meng Zhuo <m...@golangcn.org>
Run-TryBot: Meng Zhuo <m...@golangcn.org>
TryBot-Result: Go Bot <go...@golang.org>
Reviewed-by: Keith Randall <k...@golang.org>
---
M src/cmd/compile/internal/ssagen/ssa.go
M src/cmd/compile/internal/test/inl_test.go
M src/runtime/hash64.go
M src/runtime/internal/math/math.go
4 files changed, 84 insertions(+), 78 deletions(-)

diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go
index 97b9700..de60cbf 100644
--- a/src/cmd/compile/internal/ssagen/ssa.go
+++ b/src/cmd/compile/internal/ssagen/ssa.go
@@ -4484,6 +4484,7 @@
},
sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64)
alias("math/bits", "Mul", "math/bits", "Mul64", sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64, sys.ArchS390X, sys.ArchMIPS64, sys.ArchMIPS64LE)
+ alias("runtime/internal/math", "Mul64", "math/bits", "Mul64", sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64, sys.ArchS390X, sys.ArchMIPS64, sys.ArchMIPS64LE)
addF("math/bits", "Add64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
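
The single line added above aliases runtime/internal/math.Mul64 onto the existing math/bits.Mul64 intrinsic, so the runtime's new mix helper compiles down to one widening multiply plus an xor on amd64, arm64, ppc64, s390x and mips64/mips64le instead of a function call. The stand-alone snippet below only illustrates the semantics being aliased, via the public math/bits API; the two constants are m1 and m2 from this CL.

package main

import (
    "fmt"
    "math/bits"
)

func main() {
    // Full 128-bit product of two 64-bit values, returned as (hi, lo).
    // Where the compiler intrinsifies Mul64 this is a single widening
    // multiply instruction rather than a call.
    hi, lo := bits.Mul64(0xa0761d6478bd642f, 0xe7037ed1a0b428db)

    // The runtime's new mix() xor-folds the two halves together.
    fmt.Printf("hi=%#x lo=%#x mix=%#x\n", hi, lo, hi^lo)
}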
diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go
index fb9942a..6f10003 100644
--- a/src/cmd/compile/internal/test/inl_test.go
+++ b/src/cmd/compile/internal/test/inl_test.go
@@ -175,8 +175,8 @@
want["runtime/internal/sys"] = append(want["runtime/internal/sys"], "Bswap32")
}
if bits.UintSize == 64 {
- // rotl_31 is only defined on 64-bit architectures
- want["runtime"] = append(want["runtime"], "rotl_31")
+ // mix is only defined on 64-bit architectures
+ want["runtime"] = append(want["runtime"], "mix")
}

switch runtime.GOARCH {
diff --git a/src/runtime/hash64.go b/src/runtime/hash64.go
index 1bee666..5f7d00b 100644
--- a/src/runtime/hash64.go
+++ b/src/runtime/hash64.go
@@ -3,107 +3,91 @@
// license that can be found in the LICENSE file.

// Hashing algorithm inspired by
-// xxhash: https://code.google.com/p/xxhash/
-// cityhash: https://code.google.com/p/cityhash/
+// wyhash: https://github.com/wangyi-fudan/wyhash

//go:build amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || wasm
// +build amd64 arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x wasm

package runtime

-import "unsafe"
+import (
+ "runtime/internal/math"
+ "unsafe"
+)

const (
- // Constants for multiplication: four random odd 64-bit numbers.
- m1 = 16877499708836156737
- m2 = 2820277070424839065
- m3 = 9497967016996688599
- m4 = 15839092249703872147
+ m1 = 0xa0761d6478bd642f
+ m2 = 0xe7037ed1a0b428db
+ m3 = 0x8ebc6af09c88c6e3
+ m4 = 0x589965cc75374cc3
+ m5 = 0x1d8e4e27c47d124f
)

func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
- h := uint64(seed + s*hashkey[0])
-tail:
+ var a, b uintptr
+ seed ^= hashkey[0] ^ m1
switch {
case s == 0:
+ return seed
case s < 4:
- h ^= uint64(*(*byte)(p))
- h ^= uint64(*(*byte)(add(p, s>>1))) << 8
- h ^= uint64(*(*byte)(add(p, s-1))) << 16
- h = rotl_31(h*m1) * m2
- case s <= 8:
- h ^= uint64(readUnaligned32(p))
- h ^= uint64(readUnaligned32(add(p, s-4))) << 32
- h = rotl_31(h*m1) * m2
+ a = uintptr(*(*byte)(p))
+ a |= uintptr(*(*byte)(add(p, s>>1))) << 8
+ a |= uintptr(*(*byte)(add(p, s-1))) << 16
+ case s == 4:
+ a = r4(p)
+ b = a
+ case s < 8:
+ a = r4(p)
+ b = r4(add(p, s-4))
+ case s == 8:
+ a = r8(p)
+ b = a
case s <= 16:
- h ^= readUnaligned64(p)
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-8))
- h = rotl_31(h*m1) * m2
- case s <= 32:
- h ^= readUnaligned64(p)
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, 8))
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-16))
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-8))
- h = rotl_31(h*m1) * m2
+ a = r8(p)
+ b = r8(add(p, s-8))
default:
- v1 := h
- v2 := uint64(seed * hashkey[1])
- v3 := uint64(seed * hashkey[2])
- v4 := uint64(seed * hashkey[3])
- for s >= 32 {
- v1 ^= readUnaligned64(p)
- v1 = rotl_31(v1*m1) * m2
- p = add(p, 8)
- v2 ^= readUnaligned64(p)
- v2 = rotl_31(v2*m2) * m3
- p = add(p, 8)
- v3 ^= readUnaligned64(p)
- v3 = rotl_31(v3*m3) * m4
- p = add(p, 8)
- v4 ^= readUnaligned64(p)
- v4 = rotl_31(v4*m4) * m1
- p = add(p, 8)
- s -= 32
+ l := s
+ if l > 48 {
+ seed1 := seed
+ seed2 := seed
+ for ; l > 48; l -= 48 {
+ seed = mix(r8(p)^m2, r8(add(p, 8))^seed)
+ seed1 = mix(r8(add(p, 16))^m3, r8(add(p, 24))^seed1)
+ seed2 = mix(r8(add(p, 32))^m4, r8(add(p, 40))^seed2)
+ p = add(p, 48)
+ }
+ seed ^= seed1 ^ seed2
}
- h = v1 ^ v2 ^ v3 ^ v4
- goto tail
+ for ; l > 16; l -= 16 {
+ seed = mix(r8(p)^m2, r8(add(p, 8))^seed)
+ p = add(p, 16)
+ }
+ a = r8(add(p, l-16))
+ b = r8(add(p, l-8))
}

- h ^= h >> 29
- h *= m3
- h ^= h >> 32
- return uintptr(h)
+ return mix(m5^s, mix(a^m2, b^seed))
}

func memhash32Fallback(p unsafe.Pointer, seed uintptr) uintptr {
- h := uint64(seed + 4*hashkey[0])
- v := uint64(readUnaligned32(p))
- h ^= v
- h ^= v << 32
- h = rotl_31(h*m1) * m2
- h ^= h >> 29
- h *= m3
- h ^= h >> 32
- return uintptr(h)
+ a := r4(p)
+ return mix(m5^4, mix(a^m2, a^seed^hashkey[0]^m1))
}

func memhash64Fallback(p unsafe.Pointer, seed uintptr) uintptr {
- h := uint64(seed + 8*hashkey[0])
- h ^= uint64(readUnaligned32(p)) | uint64(readUnaligned32(add(p, 4)))<<32
- h = rotl_31(h*m1) * m2
- h ^= h >> 29
- h *= m3
- h ^= h >> 32
- return uintptr(h)
+ a := r8(p)
+ return mix(m5^8, mix(a^m2, a^seed^hashkey[0]^m1))
}

-// Note: in order to get the compiler to issue rotl instructions, we
-// need to constant fold the shift amount by hand.
-// TODO: convince the compiler to issue rotl instructions after inlining.
-func rotl_31(x uint64) uint64 {
- return (x << 31) | (x >> (64 - 31))
+func mix(a, b uintptr) uintptr {
+ hi, lo := math.Mul64(uint64(a), uint64(b))
+ return uintptr(hi ^ lo)
+}
+
+func r4(p unsafe.Pointer) uintptr {
+ return uintptr(readUnaligned32(p))
+}
+
+func r8(p unsafe.Pointer) uintptr {
+ return uintptr(readUnaligned64(p))
}
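
For experimenting with the new structure outside the runtime, the following is a hedged, stand-alone sketch of the 9-16 byte path of the rewritten memhashFallback. It substitutes encoding/binary and math/bits for the runtime's readUnaligned64 and the Mul64 intrinsic, fixes the byte order to little-endian for simplicity (the runtime reads in native order), and takes a caller-supplied seed in place of the per-process hashkey[0]; the constants are m1, m2 and m5 from the diff above.

package wyhashsketch

import (
    "encoding/binary"
    "math/bits"
)

const (
    m1 = 0xa0761d6478bd642f
    m2 = 0xe7037ed1a0b428db
    m5 = 0x1d8e4e27c47d124f
)

// mix multiplies a and b into a 128-bit product and xor-folds the two
// halves, mirroring the runtime's new mix helper.
func mix(a, b uint64) uint64 {
    hi, lo := bits.Mul64(a, b)
    return hi ^ lo
}

// hash9to16 hashes 9..16 bytes the way the new fallback does: two
// overlapping 8-byte reads so that every byte is covered, then two rounds
// of mix.
func hash9to16(p []byte, seed uint64) uint64 {
    s := uint64(len(p))
    seed ^= m1 // the runtime additionally folds in the random hashkey[0] here
    a := binary.LittleEndian.Uint64(p)
    b := binary.LittleEndian.Uint64(p[len(p)-8:])
    return mix(m5^s, mix(a^m2, b^seed))
}

The overlap between the two reads is what lets one code path cover every length from 9 to 16 bytes without branching on the exact size.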
diff --git a/src/runtime/internal/math/math.go b/src/runtime/internal/math/math.go
index 5385f5d..b6bd12d 100644
--- a/src/runtime/internal/math/math.go
+++ b/src/runtime/internal/math/math.go
@@ -17,3 +17,24 @@
overflow := b > MaxUintptr/a
return a * b, overflow
}
+
+// Mul64 returns the 128-bit product of x and y: (hi, lo) = x * y
+// with the product bits' upper half returned in hi and the lower
+// half returned in lo.
+// This is a copy from math/bits.Mul64
+// On supported platforms this is an intrinsic lowered by the compiler.
+func Mul64(x, y uint64) (hi, lo uint64) {
+ const mask32 = 1<<32 - 1
+ x0 := x & mask32
+ x1 := x >> 32
+ y0 := y & mask32
+ y1 := y >> 32
+ w0 := x0 * y0
+ t := x1*y0 + w0>>32
+ w1 := t & mask32
+ w2 := t >> 32
+ w1 += x0 * y1
+ hi = x1*y1 + w2 + w1>>32
+ lo = x * y
+ return
+}
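
The doc comment above describes a portable 32x32 schoolbook decomposition; where the compiler intrinsifies the call (per the ssa.go alias earlier in this CL) this Go body is never executed, and elsewhere it is the fallback. As a quick sanity check of the decomposition, the stand-alone snippet below (names are illustrative) compares it against the 128-bit product computed with math/big.

package main

import (
    "fmt"
    "math/big"
)

// mul64 reproduces the portable decomposition added to runtime/internal/math
// (itself copied from math/bits.Mul64).
func mul64(x, y uint64) (hi, lo uint64) {
    const mask32 = 1<<32 - 1
    x0 := x & mask32
    x1 := x >> 32
    y0 := y & mask32
    y1 := y >> 32
    w0 := x0 * y0
    t := x1*y0 + w0>>32
    w1 := t & mask32
    w2 := t >> 32
    w1 += x0 * y1
    hi = x1*y1 + w2 + w1>>32
    lo = x * y
    return
}

func main() {
    x, y := uint64(0x8ebc6af09c88c6e3), uint64(0x589965cc75374cc3)
    hi, lo := mul64(x, y)

    // Assemble (hi, lo) into one 128-bit integer and compare with math/big.
    got := new(big.Int).Lsh(new(big.Int).SetUint64(hi), 64)
    got.Add(got, new(big.Int).SetUint64(lo))
    want := new(big.Int).Mul(new(big.Int).SetUint64(x), new(big.Int).SetUint64(y))
    fmt.Println("match:", got.Cmp(want) == 0)
}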


Gerrit-Project: go
Gerrit-Branch: master
Gerrit-Change-Id: I76af4e2bc1995a18149d11983ea8a149c132865e
Gerrit-Change-Number: 279612
Gerrit-PatchSet: 5
Gerrit-Owner: Meng Zhuo <m...@golangcn.org>
Gerrit-Reviewer: Cuong Manh Le <cuong.m...@gmail.com>
Gerrit-Reviewer: Go Bot <go...@golang.org>
Gerrit-Reviewer: Keith Randall <k...@golang.org>
Gerrit-Reviewer: Meng Zhuo <m...@golangcn.org>
Gerrit-Reviewer: Russ Cox <r...@golang.org>
Gerrit-CC: eric fang <eric...@arm.com>
Gerrit-MessageType: merged