memclr optimization does worse?

464 views
Skip to first unread message

T L

unread,
Dec 14, 2016, 9:12:08 AM12/14/16
to golang-nuts
I just read this issue thread: https://github.com/golang/go/issues/5373
and this https://codereview.appspot.com/137880043
which says:

for i := range a {
a[i] = [zero val]
}

will be replaced with memclr.
I made some benchmarks, but the results are disappointing.
When the length of slice/array is very large, memclr is slower.

Result

BenchmarkMemclr_100-4           100000000            37.2 ns/op
BenchmarkLoop_100-4             100000000            70.7 ns/op
BenchmarkMemclr_1000-4          20000000           351 ns/op
BenchmarkLoop_1000-4            10000000           464 ns/op
BenchmarkMemclr_10000-4          1000000          3623 ns/op
BenchmarkLoop_10000-4            1000000          4940 ns/op
BenchmarkMemclr_100000-4          100000         49230 ns/op
BenchmarkLoop_100000-4            100000         58761 ns/op
BenchmarkMemclr_200000-4           50000         98165 ns/op
BenchmarkLoop_200000-4             50000        115833 ns/op
BenchmarkMemclr_300000-4           30000        170617 ns/op
BenchmarkLoop_300000-4             20000        190193 ns/op
BenchmarkMemclr_400000-4           20000        275676 ns/op
BenchmarkLoop_400000-4             20000        288729 ns/op
BenchmarkMemclr_500000-4           10000        410280 ns/op
BenchmarkLoop_500000-4             10000        416195 ns/op
BenchmarkMemclr_1000000-4           5000       1025504 ns/op
BenchmarkLoop_1000000-4             5000       1012198 ns/op
BenchmarkMemclr_2000000-4           2000       2071861 ns/op
BenchmarkLoop_2000000-4             2000       2032703 ns/op

test code:

package main

import "testing"

// memclr zeroes every element of a.
//
// The exact shape of this loop matters: the compiler recognizes the
// `for i := range a { a[i] = 0 }` idiom and replaces it with a call to
// runtime.memclr (see the CL linked above), which is precisely what this
// thread is benchmarking. Do not restructure it.
func memclr(a []int) {
    for i := range a {
        a[i] = 0
    }
}

// memsetLoop stores v into every element of a with a plain range loop.
// Because the stored value is a variable rather than the literal 0, the
// compiler does not apply the memclr replacement here, making this the
// non-optimized baseline for the comparison.
func memsetLoop(a []int, v int) {
    for i := range a {
        a[i] = v
    }
}

var i = 0

// The benchmark pairs below time memclr against memsetLoop at
// increasing slice lengths, from 100 up to 2,000,000 ints.
func BenchmarkMemclr_100(b *testing.B) {
    var a = make([]int, 100)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_100(b *testing.B) {
    var a = make([]int, 100)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        // NOTE(review): the fill value here is the varying loop counter
        // i, not a constant; the follow-up post in this thread calls
        // this a mistake and fixes it to a fixed value.
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_1000(b *testing.B) {
    var a = make([]int, 1000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_1000(b *testing.B) {
    var a = make([]int, 1000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_10000(b *testing.B) {
    var a = make([]int, 10000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_10000(b *testing.B) {
    var a = make([]int, 10000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_100000(b *testing.B) {
    var a = make([]int, 100000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_100000(b *testing.B) {
    var a = make([]int, 100000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_200000(b *testing.B) {
    var a = make([]int, 200000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_200000(b *testing.B) {
    var a = make([]int, 200000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_300000(b *testing.B) {
    var a = make([]int, 300000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_300000(b *testing.B) {
    var a = make([]int, 300000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_400000(b *testing.B) {
    var a = make([]int, 400000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_400000(b *testing.B) {
    var a = make([]int, 400000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_500000(b *testing.B) {
    var a = make([]int, 500000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_500000(b *testing.B) {
    var a = make([]int, 500000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_1000000(b *testing.B) {
    var a = make([]int, 1000000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_1000000(b *testing.B) {
    var a = make([]int, 1000000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_2000000(b *testing.B) {
    var a = make([]int, 2000000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_2000000(b *testing.B) {
    var a = make([]int, 2000000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}




peterGo

unread,
Dec 14, 2016, 10:38:43 AM12/14/16
to golang-nuts
TL,

For your results, it's a small increase of 1.9%.

For my results, it's a small decrease of −2.03%.

$ go version
go version devel +232991e Wed Dec 14 05:51:01 2016 +0000 linux/amd64
$ go test -bench=.
BenchmarkMemclr_100-4           20000000           115 ns/op
BenchmarkLoop_100-4              5000000           244 ns/op
BenchmarkMemclr_1000-4           1000000          1026 ns/op
BenchmarkLoop_1000-4             1000000          1387 ns/op
BenchmarkMemclr_10000-4           200000         10521 ns/op
BenchmarkLoop_10000-4             100000         14285 ns/op
BenchmarkMemclr_100000-4           10000        146268 ns/op
BenchmarkLoop_100000-4             10000        168871 ns/op
BenchmarkMemclr_200000-4            5000        291458 ns/op
BenchmarkLoop_200000-4              5000        344252 ns/op
BenchmarkMemclr_300000-4            3000        494498 ns/op
BenchmarkLoop_300000-4              2000        602575 ns/op
BenchmarkMemclr_400000-4            2000        734921 ns/op
BenchmarkLoop_400000-4              2000        779482 ns/op
BenchmarkMemclr_500000-4            2000        981884 ns/op
BenchmarkLoop_500000-4              2000       1008058 ns/op
BenchmarkMemclr_1000000-4           1000       2073439 ns/op
BenchmarkLoop_1000000-4             1000       2093744 ns/op
BenchmarkMemclr_2000000-4            300       3932547 ns/op
BenchmarkLoop_2000000-4              300       4132627 ns/op
PASS
ok      tl    34.872s
$


Peter

peterGo

unread,
Dec 14, 2016, 4:21:51 PM12/14/16
to golang-nuts
TL,

To paraphrase: There are lies, damned lies, and benchmarks [statistics].

Let's use another machine.

The results of your benchmarks.

$ go version
go version devel +96414ca Wed Dec 14 19:36:20 2016 +0000 linux/amd64
$ go test -bench=. -cpu=4
BenchmarkMemclr_100-4           100000000            13.0 ns/op
BenchmarkLoop_100-4             50000000            34.2 ns/op
BenchmarkMemclr_1000-4          20000000           110 ns/op
BenchmarkLoop_1000-4             5000000           262 ns/op
BenchmarkMemclr_10000-4          1000000          1080 ns/op
BenchmarkLoop_10000-4             500000          2861 ns/op
BenchmarkMemclr_100000-4          100000         16137 ns/op
BenchmarkLoop_100000-4             50000         31763 ns/op
BenchmarkMemclr_200000-4           50000         31774 ns/op
BenchmarkLoop_200000-4             20000         63448 ns/op
BenchmarkMemclr_300000-4           30000         47662 ns/op
BenchmarkLoop_300000-4             20000         95335 ns/op
BenchmarkMemclr_400000-4           20000         63424 ns/op
BenchmarkLoop_400000-4             10000        127160 ns/op
BenchmarkMemclr_500000-4           20000         81460 ns/op
BenchmarkLoop_500000-4             10000        159163 ns/op
BenchmarkMemclr_1000000-4          10000        204890 ns/op
BenchmarkLoop_1000000-4             5000        327647 ns/op
BenchmarkMemclr_2000000-4           2000        733899 ns/op
BenchmarkLoop_2000000-4             2000        885830 ns/op
PASS
ok      tl    36.282s
$


Memclr is 17.15% faster than Loop for a very large slice.

Peter

not...@google.com

unread,
Dec 14, 2016, 5:50:14 PM12/14/16
to golang-nuts
Be wary of slice size, as caching is going to have an extremely strong effect on the results.  I submitted a CL that made append only clear memory that was not going to be overwritten ( https://github.com/golang/go/commit/c1e267cc734135a66af8a1a5015e572cbb598d44 ).  I thought this would have a much larger impact, but it only had a small impact.  memclr would zero the memory, but it also brought it into the cache, where it was hot for being overwritten.

Have you tried running with perf to see dcache misses for each benchmark?

sheepbao

unread,
Dec 14, 2016, 8:28:51 PM12/14/16
to golang-nuts
I have the same result in the Mac, go 1.7.1
```go

BenchmarkMemclr_100-4       100000000         22.8 ns/op

BenchmarkLoop_100-4         30000000         47.1 ns/op

BenchmarkMemclr_1000-4      10000000       181 ns/op

BenchmarkLoop_1000-4         5000000       365 ns/op

BenchmarkMemclr_10000-4       500000       2777 ns/op

BenchmarkLoop_10000-4         300000       4003 ns/op

BenchmarkMemclr_100000-4       50000     38993 ns/op

BenchmarkLoop_100000-4         30000     43893 ns/op

BenchmarkMemclr_200000-4       20000     79159 ns/op

BenchmarkLoop_200000-4         20000     87533 ns/op

BenchmarkMemclr_300000-4       10000     127745 ns/op

BenchmarkLoop_300000-4         10000     140770 ns/op

BenchmarkMemclr_400000-4       10000     217689 ns/op

BenchmarkLoop_400000-4         10000     234632 ns/op

BenchmarkMemclr_500000-4        5000     344265 ns/op

BenchmarkLoop_500000-4          2000     535585 ns/op

BenchmarkMemclr_1000000-4       1000   1130508 ns/op

BenchmarkLoop_1000000-4         2000     889592 ns/op

BenchmarkMemclr_2000000-4       1000   2071970 ns/op

BenchmarkLoop_2000000-4         1000   1758001 ns/op

PASS

ok  _/Users/bao/program/go/learn/goTour/memclr 37.313s


```

T L

unread,
Dec 14, 2016, 9:05:23 PM12/14/16
to golang-nuts

I'm sorry, there is a mistake in the test code, a fixed version:

package main

import "testing"

// MyInt is the element type under test; switching it between int32 and
// int (see the follow-up posts) changes the element size and hence the
// relative memclr/loop results.
type MyInt int32

// initialValue is a constant fill value for memsetLoop, so both
// benchmarks now store the same value (0) — unlike the first version,
// which passed the varying loop counter.
var initialValue MyInt = 0

// memclr zeroes every element of a. The `for i := range a { a[i] = 0 }`
// shape is the exact pattern the compiler replaces with runtime.memclr;
// do not restructure it.
func memclr(a []MyInt) {

    for i := range a {
        a[i] = 0
    }
}

// memsetLoop stores v into every element of a. Because the stored value
// is a variable rather than the literal 0, the compiler keeps the plain
// loop, making this the non-memclr baseline.
func memsetLoop(a []MyInt, v MyInt) {

    for i := range a {
        a[i] = v
    }
}

// The benchmark pairs below time memclr against memsetLoop at slice
// lengths from 10 up to 2,000,000 MyInt elements; both now fill with
// the same constant value.
func BenchmarkMemclr_10(b *testing.B) {
    var a = make([]MyInt, 10)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_10(b *testing.B) {
    var a = make([]MyInt, 10)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_100(b *testing.B) {
    var a = make([]MyInt, 100)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_100(b *testing.B) {
    var a = make([]MyInt, 100)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_1000(b *testing.B) {
    var a = make([]MyInt, 1000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_1000(b *testing.B) {
    var a = make([]MyInt, 1000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_10000(b *testing.B) {
    var a = make([]MyInt, 10000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_10000(b *testing.B) {
    var a = make([]MyInt, 10000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_100000(b *testing.B) {
    var a = make([]MyInt, 100000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_100000(b *testing.B) {
    var a = make([]MyInt, 100000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_200000(b *testing.B) {
    var a = make([]MyInt, 200000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_200000(b *testing.B) {
    var a = make([]MyInt, 200000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_300000(b *testing.B) {
    var a = make([]MyInt, 300000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_300000(b *testing.B) {
    var a = make([]MyInt, 300000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_400000(b *testing.B) {
    var a = make([]MyInt, 400000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_400000(b *testing.B) {
    var a = make([]MyInt, 400000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_500000(b *testing.B) {
    var a = make([]MyInt, 500000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_500000(b *testing.B) {
    var a = make([]MyInt, 500000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_1000000(b *testing.B) {
    var a = make([]MyInt, 1000000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_1000000(b *testing.B) {
    var a = make([]MyInt, 1000000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_2000000(b *testing.B) {
    var a = make([]MyInt, 2000000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_2000000(b *testing.B) {
    var a = make([]MyInt, 2000000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

The memclr versions are really better,
but their advantage becomes smaller for large slices.

T L

unread,
Dec 14, 2016, 9:18:57 PM12/14/16
to golang-nuts
But if I changed the line
type MyInt int32
to
type MyInt int
then again, the memclr version becomes slower, or no advantage, for cases of slice lengths larger than 2000000.

T L

unread,
Dec 14, 2016, 11:57:51 PM12/14/16
to golang-nuts


On Thursday, December 15, 2016 at 10:18:57 AM UTC+8, T L wrote:
But if I changed the line
type MyInt int32
to
type MyInt int
then again, the memclr version becomes slower, or no advantage, for cases of slice lengths larger than 2000000.

Tried other types; it looks like the situation is more likely to happen for types whose value size is 8 bytes (on amd64).

 

T L

unread,
Dec 15, 2016, 5:03:57 AM12/15/16
to golang-nuts, not...@google.com


On Thursday, December 15, 2016 at 6:50:14 AM UTC+8, not...@google.com wrote:
Be wary of slice size, as caching is going to have an extremely strong effect on the results.  I submitted a CL that made append, only clear memory that was not going to be overwritten ( https://github.com/golang/go/commit/c1e267cc734135a66af8a1a5015e572cbb598d44 ).  I thought this would have a much larger impact, but it only had a small impact.  memclr would zero the memory, but it also brought it into the cache, where it was hot for being overwritten.

Have you tried running with perf to see dcache misses for each benchmark?


How do I check cache misses with Go pprof?
 

rd

unread,
Dec 15, 2016, 11:36:47 AM12/15/16
to golang-nuts
TL,

As peterGo, I was unable to reproduce your findings:

uname -a
Linux 4.8.0-30-generic #32-Ubuntu SMP Fri Dec 2 03:43:27 UTC 2016 x86_64 x86_64 x86_64 GNU/Linux

go version
go version go1.7.4 linux/amd64

cat /proc/cpuinfo
CPU Intel(R) Core(TM) i7-6560U CPU @ 2.20GHz

go test -bench=.
[...]
BenchmarkMemclr_2000000-4           3000        421532 ns/op
BenchmarkLoop_2000000-4               2000        791318 ns/op

So memclr is ~2x faster on my machine.

In order to see what actually happens, let's use the pprof tool:
go test -bench=. -cpuprofile test.prof

Then `go tool pprof test.prof`, and `top 5` (sanity check):
      flat  flat%   sum%        cum   cum%
     1.69s 57.88% 57.88%      1.69s 57.88%  _/tmp/goperf.memsetLoop
     1.22s 41.78% 99.66%      1.22s 41.78%  runtime.memclr

So far so good, memsetloop and the _runtime_ memclr are being called.

Going down the rabbit hole, let's look at the assembly:
 (pprof) disasm memsetLoop
Total: 2.92s
ROUTINE ======================== _/tmp/goperf.memsetLoop
     1.69s      1.69s (flat, cum) 57.88% of Total
         .          .     46d770: MOVQ 0x10(SP), AX
         .          .     46d775: MOVQ 0x8(SP), CX
         .          .     46d77a: MOVL 0x20(SP), DX
         .          .     46d77e: XORL BX, BX
         .          .     46d780: CMPQ AX, BX
         .          .     46d783: JGE 0x46d790
     400ms      400ms     46d785: MOVL DX, 0(CX)(BX*4)
     1.14s      1.14s     46d788: INCQ BX
     150ms      150ms     46d78b: CMPQ AX, BX
         .          .     46d78e: JL 0x46d785

Standard loop, and definitively not using vectorized instructions (explains the difference on my CPU)

For comparison, the finely hand-tuned memclr implementation is at https://golang.org/src/runtime/memclr_amd64.s (my computer being fairly recent, it takes full advantage of the large registers available).

Can you try to perform the same exercise on your hardware? It will likely shed some lights on the peculiar results you are experiencing.

Regards
RD

Michael Jones

unread,
Dec 15, 2016, 7:57:17 PM12/15/16
to rd, golang-nuts
go version go1.7.4 linux/amd64

BenchmarkMemclr_100-36         500000000        31.5 ns/op
BenchmarkLoop_100-36           200000000        71.7 ns/op
BenchmarkMemclr_1000-36       50000000       257 ns/op
BenchmarkLoop_1000-36         20000000       612 ns/op
BenchmarkMemclr_10000-36       5000000      2675 ns/op
BenchmarkLoop_10000-36         2000000      6280 ns/op
BenchmarkMemclr_100000-36      500000     39956 ns/op
BenchmarkLoop_100000-36        200000     66346 ns/op
BenchmarkMemclr_200000-36      200000     79805 ns/op
BenchmarkLoop_200000-36        100000    132527 ns/op
BenchmarkMemclr_300000-36      200000    119613 ns/op
BenchmarkLoop_300000-36        100000    198872 ns/op
BenchmarkMemclr_400000-36      100000    160355 ns/op
BenchmarkLoop_400000-36         50000    265406 ns/op
BenchmarkMemclr_500000-36      100000    199190 ns/op
BenchmarkLoop_500000-36         50000    331522 ns/op
BenchmarkMemclr_1000000-36       50000    398051 ns/op
BenchmarkLoop_1000000-36         20000    663510 ns/op
BenchmarkMemclr_2000000-36       20000    796084 ns/op
BenchmarkLoop_2000000-36         10000   1326865 ns/op

Uniformly better on my AWS test system:

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                36
On-line CPU(s) list:   0-35
Thread(s) per core:    2
Core(s) per socket:    9
Socket(s):             2
NUMA node(s):          2
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 63
Model name:            Intel(R) Xeon(R) CPU E5-2666 v3 @ 2.90GHz
Stepping:              2
CPU MHz:               3199.968
BogoMIPS:              6101.39
Hypervisor vendor:     Xen
Virtualization type:   full
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              25600K
NUMA node0 CPU(s):     0-8,18-26
NUMA node1 CPU(s):     9-17,27-35

--
You received this message because you are subscribed to the Google Groups "golang-nuts" group.
To unsubscribe from this group and stop receiving emails from it, send an email to golang-nuts+unsubscribe@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.



--
Michael T. Jones
michae...@gmail.com

T L

unread,
Dec 15, 2016, 10:39:35 PM12/15/16
to golang-nuts, rd6...@gmail.com

Thanks for the guide, RD.

Here is the my system info

$ uname -a
Linux debian8 4.2.0-1-amd64 #1 SMP Debian 4.2.6-3 (2015-12-06) x86_64 GNU/Linux
$ go version
go version go1.7.4 linux/amd64
$ cat /proc/cpuinfo
processor    : 0
vendor_id    : GenuineIntel
cpu family    : 6
model        : 42
model name    : Intel(R) Core(TM) i3-2350M CPU @ 2.30GHz
stepping    : 7
microcode    : 0x29
cpu MHz        : 995.828
cache size    : 3072 KB
physical id    : 0
siblings    : 4
core id        : 0
cpu cores    : 2
apicid        : 0
initial apicid    : 0
fpu        : yes
fpu_exception    : yes
cpuid level    : 13
wp        : yes
flags        : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer xsave avx lahf_lm arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid xsaveopt
bugs        :
bogomips    : 4589.65
clflush size    : 64
cache_alignment    : 64
address sizes    : 36 bits physical, 48 bits virtual
power management:
...(3 more same ones)


Here is the result for MyInt==int

$ go test -bench=. -cpuprofile test.prof
testing: warning: no tests to run
BenchmarkLoop_2000000-4             1000       2026451 ns/op
BenchmarkMemclr_2000000-4           1000       2075557 ns/op
PASS
ok      _/tmp    4.546s
$ go tool pprof test.prof
Entering interactive mode (type "help" for commands)
(pprof) top 5
4.55s of 4.55s total (  100%)
Showing top 5 nodes out of 8 (cum >= 2.31s)

      flat  flat%   sum%        cum   cum%
     2.31s 50.77% 50.77%      2.31s 50.77%  runtime.memclr
     2.24s 49.23%   100%      2.24s 49.23%  _/tmp.memsetLoop
         0     0%   100%      2.24s 49.23%  _/tmp.BenchmarkLoop_2000000
         0     0%   100%      2.31s 50.77%  _/tmp.BenchmarkMemclr_2000000
         0     0%   100%      2.31s 50.77%  _/tmp.memclr
(pprof) disasm memsetLoop
Total: 4.55s
ROUTINE ======================== _/tmp.memsetLoop
     2.24s      2.24s (flat, cum) 49.23% of Total

         .          .     46d770: MOVQ 0x10(SP), AX
         .          .     46d775: MOVQ 0x8(SP), CX
         .          .     46d77a: MOVQ 0x20(SP), DX
         .          .     46d77f: XORL BX, BX
         .          .     46d781: CMPQ AX, BX
         .          .     46d784: JGE 0x46d792
         .          .     46d786: MOVQ DX, 0(CX)(BX*8)
     2.24s      2.24s     46d78a: INCQ BX
         .          .     46d78d: CMPQ AX, BX
         .          .     46d790: JL 0x46d786


And the result for MyInt=int32

$ go test -bench=. -cpuprofile test.prof
testing: warning: no tests to run
BenchmarkLoop_2000000-4             1000       1128167 ns/op
BenchmarkMemclr_2000000-4           2000       1031849 ns/op
PASS
ok      _/tmp    3.460s
$ go tool pprof test.prof
Entering interactive mode (type "help" for commands)
(pprof) top 5  
3.44s of 3.45s total (99.71%)
Dropped 10 nodes (cum <= 0.02s)
Showing top 5 nodes out of 8 (cum >= 2.18s)

      flat  flat%   sum%        cum   cum%
     2.18s 63.19% 63.19%      2.18s 63.19%  runtime.memclr
     1.26s 36.52% 99.71%      1.26s 36.52%  _/tmp.memsetLoop
         0     0% 99.71%      1.26s 36.52%  _/tmp.BenchmarkLoop_2000000
         0     0% 99.71%      2.18s 63.19%  _/tmp.BenchmarkMemclr_2000000
         0     0% 99.71%      2.18s 63.19%  _/tmp.memclr
(pprof)
(pprof) disasm memsetLoop
Total: 3.45s
ROUTINE ======================== _/tmp.memsetLoop
     1.26s      1.26s (flat, cum) 36.52% of Total

         .          .     46d770: MOVQ 0x10(SP), AX
         .          .     46d775: MOVQ 0x8(SP), CX
         .          .     46d77a: MOVL 0x20(SP), DX
         .          .     46d77e: XORL BX, BX
         .          .     46d780: CMPQ AX, BX
         .          .     46d783: JGE 0x46d790
         .          .     46d785: MOVL DX, 0(CX)(BX*4)
     1.25s      1.25s     46d788: INCQ BX
      10ms       10ms     46d78b: CMPQ AX, BX

Sokolov Yura

unread,
Dec 16, 2016, 12:05:39 AM12/16/16
to golang-nuts
Memory is slow. While slice fits to cache, memclr is measurably faster.
When slice doesn't fit cache, memclr at least not significantly faster.

I've heard, adaptive prefetching is turned on if there were 3 consequent accesses to same cache-line in increasing address order. So, perhaps optimised SSE/AVX zeroing doesn't trigger adaptive prefetch cause it uses less memory accesses. And then, it may vary much by CPU model: newer models may fix adaptive prefetch, so that memclr is great again.

Sokolov Yura

unread,
Dec 16, 2016, 12:13:06 AM12/16/16
to golang-nuts
I suppose, prefetch instructions in AVX loop (for block after current) can solve this issue.

rd6...@gmail.com

unread,
Dec 16, 2016, 8:30:12 AM12/16/16
to golang-nuts
Hi Solokov,

interesting idea, but it does not seem that the cache size would be the issue at hand here — please note that slices of more than 524288 integers do not fit in my laptop cache:

 1. The runtime is linear to the size of the array (no "cliff" — see attached picture)

 2. The page misses are ridiculously low (expected since the benchmark yields itself to contiguous allocation and processing):

 Performance counter stats for '/usr/local/go/bin/go test -bench=.':

      20882.464799      task-clock (msec)         #    0.998 CPUs utilized         
             4,947      context-switches          #    0.237 K/sec                 
               239      cpu-migrations            #    0.011 K/sec                 
            23,583      page-faults               #    0.001 M/sec                 
    64,142,963,898      cycles                    #    3.072 GHz                   
    37,246,686,615      instructions              #    0.58  insn per cycle        
     4,753,144,504      branches                  #  227.614 M/sec                 
         2,870,298      branch-misses             #    0.06% of all branches 

I am running the following excerpt from the original code:
package P

import (
   
"strconv"
   
"testing"
)


// memclr zeroes every element of a. The `for i := range a { a[i] = 0 }`
// shape is the exact pattern the compiler lowers to runtime.memclr.
// (Reconstructed from a whitespace-mangled paste; tokens unchanged.)
func memclr(a []int) {
    for i := range a {
        a[i] = 0
    }
}


func
BenchmarkMemclr(b *testing.B) {
   
for i := 100000; i < 409600000; i *= 2 {
        b
.Run("bench"+strconv.Itoa(i), func(b *testing.B) {
           
var a = make([]int, i)

            b
.ResetTimer()
           
for i := 0; i < b.N; i++ {
                memclr
(a)
           
}

       
})
   
}
}



running_time.png
Reply all
Reply to author
Forward
0 new messages