memclr optimization does worse?

464 views
Skip to first unread message

T L

unread,
Dec 14, 2016, 9:12:08 AM12/14/16
to golang-nuts
I just read this issue thread: https://github.com/golang/go/issues/5373
and this https://codereview.appspot.com/137880043
which says:

for i := range a {
a[i] = [zero val]
}

will be replaced with memclr.
I made some benchmarks, but the results are disappointing.
When the length of slice/array is very large, memclr is slower.

Result

BenchmarkMemclr_100-4           100000000            37.2 ns/op
BenchmarkLoop_100-4             100000000            70.7 ns/op
BenchmarkMemclr_1000-4          20000000           351 ns/op
BenchmarkLoop_1000-4            10000000           464 ns/op
BenchmarkMemclr_10000-4          1000000          3623 ns/op
BenchmarkLoop_10000-4            1000000          4940 ns/op
BenchmarkMemclr_100000-4          100000         49230 ns/op
BenchmarkLoop_100000-4            100000         58761 ns/op
BenchmarkMemclr_200000-4           50000         98165 ns/op
BenchmarkLoop_200000-4             50000        115833 ns/op
BenchmarkMemclr_300000-4           30000        170617 ns/op
BenchmarkLoop_300000-4             20000        190193 ns/op
BenchmarkMemclr_400000-4           20000        275676 ns/op
BenchmarkLoop_400000-4             20000        288729 ns/op
BenchmarkMemclr_500000-4           10000        410280 ns/op
BenchmarkLoop_500000-4             10000        416195 ns/op
BenchmarkMemclr_1000000-4           5000       1025504 ns/op
BenchmarkLoop_1000000-4             5000       1012198 ns/op
BenchmarkMemclr_2000000-4           2000       2071861 ns/op
BenchmarkLoop_2000000-4             2000       2032703 ns/op

test code:

package main

import "testing"

// memclr zeroes every element of a.
//
// The exact shape of this loop matters: the compiler recognizes the
// `for i := range a { a[i] = 0 }` idiom and replaces it with a call to
// runtime.memclr (see the CL linked above), which is precisely what this
// thread is benchmarking. Do not restructure it.
func memclr(a []int) {
    for i := range a {
        a[i] = 0
    }
}

// memsetLoop stores v into every element of a with a plain range loop.
// Because the stored value is a variable rather than the literal 0, the
// compiler does not apply the memclr replacement here, making this the
// non-optimized baseline for the comparison.
func memsetLoop(a []int, v int) {
    for i := range a {
        a[i] = v
    }
}

var i = 0

// The benchmark pairs below time memclr against memsetLoop at
// increasing slice lengths, from 100 up to 2,000,000 ints.
func BenchmarkMemclr_100(b *testing.B) {
    var a = make([]int, 100)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_100(b *testing.B) {
    var a = make([]int, 100)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        // NOTE(review): the fill value here is the varying loop counter
        // i, not a constant; the follow-up post in this thread calls
        // this a mistake and fixes it to a fixed value.
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_1000(b *testing.B) {
    var a = make([]int, 1000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_1000(b *testing.B) {
    var a = make([]int, 1000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_10000(b *testing.B) {
    var a = make([]int, 10000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_10000(b *testing.B) {
    var a = make([]int, 10000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_100000(b *testing.B) {
    var a = make([]int, 100000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_100000(b *testing.B) {
    var a = make([]int, 100000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_200000(b *testing.B) {
    var a = make([]int, 200000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_200000(b *testing.B) {
    var a = make([]int, 200000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_300000(b *testing.B) {
    var a = make([]int, 300000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_300000(b *testing.B) {
    var a = make([]int, 300000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_400000(b *testing.B) {
    var a = make([]int, 400000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_400000(b *testing.B) {
    var a = make([]int, 400000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_500000(b *testing.B) {
    var a = make([]int, 500000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_500000(b *testing.B) {
    var a = make([]int, 500000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_1000000(b *testing.B) {
    var a = make([]int, 1000000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_1000000(b *testing.B) {
    var a = make([]int, 1000000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}

func BenchmarkMemclr_2000000(b *testing.B) {
    var a = make([]int, 2000000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_2000000(b *testing.B) {
    var a = make([]int, 2000000)
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, i)
    }
}




peterGo

unread,
Dec 14, 2016, 10:38:43 AM12/14/16
to golang-nuts
TL,

For your results, it's a small increase of 1.9%.

For my results, it's a small decrease of −2.03%.

$ go version
go version devel +232991e Wed Dec 14 05:51:01 2016 +0000 linux/amd64
$ go test -bench=.
BenchmarkMemclr_100-4           20000000           115 ns/op
BenchmarkLoop_100-4              5000000           244 ns/op
BenchmarkMemclr_1000-4           1000000          1026 ns/op
BenchmarkLoop_1000-4             1000000          1387 ns/op
BenchmarkMemclr_10000-4           200000         10521 ns/op
BenchmarkLoop_10000-4             100000         14285 ns/op
BenchmarkMemclr_100000-4           10000        146268 ns/op
BenchmarkLoop_100000-4             10000        168871 ns/op
BenchmarkMemclr_200000-4            5000        291458 ns/op
BenchmarkLoop_200000-4              5000        344252 ns/op
BenchmarkMemclr_300000-4            3000        494498 ns/op
BenchmarkLoop_300000-4              2000        602575 ns/op
BenchmarkMemclr_400000-4            2000        734921 ns/op
BenchmarkLoop_400000-4              2000        779482 ns/op
BenchmarkMemclr_500000-4            2000        981884 ns/op
BenchmarkLoop_500000-4              2000       1008058 ns/op
BenchmarkMemclr_1000000-4           1000       2073439 ns/op
BenchmarkLoop_1000000-4             1000       2093744 ns/op
BenchmarkMemclr_2000000-4            300       3932547 ns/op
BenchmarkLoop_2000000-4              300       4132627 ns/op
PASS
ok      tl    34.872s
$


Peter

peterGo

unread,
Dec 14, 2016, 4:21:51 PM12/14/16
to golang-nuts
TL,

To paraphrase: There are lies, damned lies, and benchmarks [statistics].

Let's use another machine.

The results of your benchmarks.

$ go version
go version devel +96414ca Wed Dec 14 19:36:20 2016 +0000 linux/amd64
$ go test -bench=. -cpu=4
BenchmarkMemclr_100-4           100000000            13.0 ns/op
BenchmarkLoop_100-4             50000000            34.2 ns/op
BenchmarkMemclr_1000-4          20000000           110 ns/op
BenchmarkLoop_1000-4             5000000           262 ns/op
BenchmarkMemclr_10000-4          1000000          1080 ns/op
BenchmarkLoop_10000-4             500000          2861 ns/op
BenchmarkMemclr_100000-4          100000         16137 ns/op
BenchmarkLoop_100000-4             50000         31763 ns/op
BenchmarkMemclr_200000-4           50000         31774 ns/op
BenchmarkLoop_200000-4             20000         63448 ns/op
BenchmarkMemclr_300000-4           30000         47662 ns/op
BenchmarkLoop_300000-4             20000         95335 ns/op
BenchmarkMemclr_400000-4           20000         63424 ns/op
BenchmarkLoop_400000-4             10000        127160 ns/op
BenchmarkMemclr_500000-4           20000         81460 ns/op
BenchmarkLoop_500000-4             10000        159163 ns/op
BenchmarkMemclr_1000000-4          10000        204890 ns/op
BenchmarkLoop_1000000-4             5000        327647 ns/op
BenchmarkMemclr_2000000-4           2000        733899 ns/op
BenchmarkLoop_2000000-4             2000        885830 ns/op
PASS
ok      tl    36.282s
$


Memclr is 17.15% faster than Loop for a very large slice.

Peter

not...@google.com

unread,
Dec 14, 2016, 5:50:14 PM12/14/16
to golang-nuts
Be wary of slice size, as caching is going to have an extremely strong effect on the results.  I submitted a CL that made append only clear memory that was not going to be overwritten ( https://github.com/golang/go/commit/c1e267cc734135a66af8a1a5015e572cbb598d44 ).  I thought this would have a much larger impact, but it only had a small impact.  memclr would zero the memory, but it also brought it into the cache, where it was hot for being overwritten.

Have you tried running with perf to see dcache misses for each benchmark?

sheepbao

unread,
Dec 14, 2016, 8:28:51 PM12/14/16
to golang-nuts
I have the same result in the Mac, go 1.7.1
```go

BenchmarkMemclr_100-4       100000000         22.8 ns/op

BenchmarkLoop_100-4         30000000         47.1 ns/op

BenchmarkMemclr_1000-4      10000000       181 ns/op

BenchmarkLoop_1000-4         5000000       365 ns/op

BenchmarkMemclr_10000-4       500000       2777 ns/op

BenchmarkLoop_10000-4         300000       4003 ns/op

BenchmarkMemclr_100000-4       50000     38993 ns/op

BenchmarkLoop_100000-4         30000     43893 ns/op

BenchmarkMemclr_200000-4       20000     79159 ns/op

BenchmarkLoop_200000-4         20000     87533 ns/op

BenchmarkMemclr_300000-4       10000     127745 ns/op

BenchmarkLoop_300000-4         10000     140770 ns/op

BenchmarkMemclr_400000-4       10000     217689 ns/op

BenchmarkLoop_400000-4         10000     234632 ns/op

BenchmarkMemclr_500000-4        5000     344265 ns/op

BenchmarkLoop_500000-4          2000     535585 ns/op

BenchmarkMemclr_1000000-4       1000   1130508 ns/op

BenchmarkLoop_1000000-4         2000     889592 ns/op

BenchmarkMemclr_2000000-4       1000   2071970 ns/op

BenchmarkLoop_2000000-4         1000   1758001 ns/op

PASS

ok  _/Users/bao/program/go/learn/goTour/memclr 37.313s


```

T L

unread,
Dec 14, 2016, 9:05:23 PM12/14/16
to golang-nuts

I'm sorry, there is a mistake in the test code, a fixed version:

package main

import "testing"

// MyInt is the element type under test; switching it between int32 and
// int (see the follow-up posts) changes the element size and hence the
// relative memclr/loop results.
type MyInt int32

// initialValue is a constant fill value for memsetLoop, so both
// benchmarks now store the same value (0) — unlike the first version,
// which passed the varying loop counter.
var initialValue MyInt = 0

// memclr zeroes every element of a. The `for i := range a { a[i] = 0 }`
// shape is the exact pattern the compiler replaces with runtime.memclr;
// do not restructure it.
func memclr(a []MyInt) {

    for i := range a {
        a[i] = 0
    }
}

// memsetLoop stores v into every element of a. Because the stored value
// is a variable rather than the literal 0, the compiler keeps the plain
// loop, making this the non-memclr baseline.
func memsetLoop(a []MyInt, v MyInt) {

    for i := range a {
        a[i] = v
    }
}

// The benchmark pairs below time memclr against memsetLoop at slice
// lengths from 10 up to 2,000,000 MyInt elements; both now fill with
// the same constant value.
func BenchmarkMemclr_10(b *testing.B) {
    var a = make([]MyInt, 10)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_10(b *testing.B) {
    var a = make([]MyInt, 10)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_100(b *testing.B) {
    var a = make([]MyInt, 100)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_100(b *testing.B) {
    var a = make([]MyInt, 100)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_1000(b *testing.B) {
    var a = make([]MyInt, 1000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_1000(b *testing.B) {
    var a = make([]MyInt, 1000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_10000(b *testing.B) {
    var a = make([]MyInt, 10000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_10000(b *testing.B) {
    var a = make([]MyInt, 10000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_100000(b *testing.B) {
    var a = make([]MyInt, 100000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_100000(b *testing.B) {
    var a = make([]MyInt, 100000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_200000(b *testing.B) {
    var a = make([]MyInt, 200000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_200000(b *testing.B) {
    var a = make([]MyInt, 200000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_300000(b *testing.B) {
    var a = make([]MyInt, 300000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_300000(b *testing.B) {
    var a = make([]MyInt, 300000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_400000(b *testing.B) {
    var a = make([]MyInt, 400000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_400000(b *testing.B) {
    var a = make([]MyInt, 400000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_500000(b *testing.B) {
    var a = make([]MyInt, 500000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_500000(b *testing.B) {
    var a = make([]MyInt, 500000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_1000000(b *testing.B) {
    var a = make([]MyInt, 1000000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_1000000(b *testing.B) {
    var a = make([]MyInt, 1000000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

func BenchmarkMemclr_2000000(b *testing.B) {
    var a = make([]MyInt, 2000000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memclr(a)
    }
}

func BenchmarkLoop_2000000(b *testing.B) {
    var a = make([]MyInt, 2000000)

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        memsetLoop(a, initialValue)
    }
}

The memclr versions are really better,
but their advantage becomes smaller for large slices.

T L

unread,
Dec 14, 2016, 9:18:57 PM12/14/16
to golang-nuts
But if I changed the line
type MyInt int32
to
type MyInt int
then again, the memclr version becomes slower, or no advantage, for cases of slice lengths larger than 2000000.

T L

unread,
Dec 14, 2016, 11:57:51 PM12/14/16
to golang-nuts


On Thursday, December 15, 2016 at 10:18:57 AM UTC+8, T L wrote:
But if I changed the line
type MyInt int32
to
type MyInt int
then again, the memclr version becomes slower, or no advantage, for cases of slice lengths larger than 2000000.

Tried other types; it looks like the situation is more likely to happen for types whose value size is 8 bytes (on amd64).

 

T L

unread,
Dec 15, 2016, 5:03:57 AM12/15/16
to golang-nuts, not...@google.com


On Thursday, December 15, 2016 at 6:50:14 AM UTC+8, not...@google.com wrote:
Be wary of slice size, as caching is going to have an extremely strong effect on the results.  I submitted a CL that made append, only clear memory that was not going to be overwritten ( https://github.com/golang/go/commit/c1e267cc734135a66af8a1a5015e572cbb598d44 ).  I thought this would have a much larger impact, but it only had a small impact.  memclr would zero the memory, but it also brought it into the cache, where it was hot for being overwritten.

Have you tried running with perf to see dcache misses for each benchmark?


How do I check cache misses with Go pprof?
 

rd

unread,
Dec 15, 2016, 11:36:47 AM12/15/16
to golang-nuts
TL,

As peterGo, I was unable to reproduce your findings:

uname -a
Linux 4.8.0-30-generic #32-Ubuntu SMP Fri Dec 2 03:43:27 UTC 2016 x86_64 x86_64 x86_64 GNU/Linux

go version
go version go1.7.4 linux/amd64

cat /proc/cpuinfo
CPU Intel(R) Core(TM) i7-6560U CPU @ 2.20GHz

go test -bench=.
[...]
BenchmarkMemclr_2000000-4           3000        421532 ns/op
BenchmarkLoop_2000000-4               2000        791318 ns/op

So memclr is ~2x faster on my machine.

In order to see what actually happens, let's use the pprof tool:
go test -bench=. -cpuprofile test.prof

Then `go tool pprof test.prof`, and `top 5` (sanity check):
      flat  flat%   sum%        cum   cum%
     1.69s 57.88% 57.88%      1.69s 57.88%  _/tmp/goperf.memsetLoop
     1.22s 41.78% 99.66%      1.22s 41.78%  runtime.memclr

So far so good, memsetloop and the _runtime_ memclr are being called.

Going down the rabbit hole, let's look at the assembly:
 (pprof) disasm memsetLoop
Total: 2.92s
ROUTINE ======================== _/tmp/goperf.memsetLoop
     1.69s      1.69s (flat, cum) 57.88% of Total
         .          .     46d770: MOVQ 0x10(SP), AX
         .          .     46d775: MOVQ 0x8(SP), CX
         .          .     46d77a: MOVL 0x20(SP), DX
         .          .     46d77e: XORL BX, BX
         .          .     46d780: CMPQ AX, BX
         .          .     46d783: JGE 0x46d790
     400ms      400ms     46d785: MOVL DX, 0(CX)(BX*4)
     1.14s      1.14s     46d788: INCQ BX
     150ms      150ms     46d78b: CMPQ AX, BX
         .          .     46d78e: JL 0x46d785

Standard loop, and definitively not using vectorized instructions (explains the difference on my CPU)

For comparison, the finely hand-tuned memclr implementation is at https://golang.org/src/runtime/memclr_amd64.s (my computer being fairly recent, it takes full advantage of the large registers available).

Can you try to perform the same exercise on your hardware? It will likely shed some lights on the peculiar results you are experiencing.

Regards
RD

Michael Jones

unread,
Dec 15, 2016, 7:57:17 PM12/15/16
to rd, golang-nuts
go version go1.7.4 linux/amd64

BenchmarkMemclr_100-36         500000000        31.5 ns/op
BenchmarkLoop_100-36           200000000        71.7 ns/op
BenchmarkMemclr_1000-36       50000000       257 ns/op
BenchmarkLoop_1000-36         20000000       612 ns/op
BenchmarkMemclr_10000-36       5000000      2675 ns/op
BenchmarkLoop_10000-36         2000000      6280 ns/op
BenchmarkMemclr_100000-36      500000     39956 ns/op
BenchmarkLoop_100000-36        200000     66346 ns/op
BenchmarkMemclr_200000-36      200000     79805 ns/op
BenchmarkLoop_200000-36        100000    132527 ns/op
BenchmarkMemclr_300000-36      200000    119613 ns/op
BenchmarkLoop_300000-36        100000    198872 ns/op
BenchmarkMemclr_400000-36      100000    160355 ns/op
BenchmarkLoop_400000-36         50000    265406 ns/op
BenchmarkMemclr_500000-36      100000    199190 ns/op
BenchmarkLoop_500000-36         50000    331522 ns/op
BenchmarkMemclr_1000000-36       50000    398051 ns/op
BenchmarkLoop_1000000-36         20000    663510 ns/op
BenchmarkMemclr_2000000-36       20000    796084 ns/op
BenchmarkLoop_2000000-36         10000   1326865 ns/op

Uniformly better on my AWS test system:

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                36
On-line CPU(s) list:   0-35
Thread(s) per core:    2
Core(s) per socket:    9
Socket(s):             2
NUMA node(s):          2
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 63
Model name:            Intel(R) Xeon(R) CPU E5-2666 v3 @ 2.90GHz
Stepping:              2
CPU MHz:               3199.968
BogoMIPS:              6101.39
Hypervisor vendor:     Xen
Virtualization type:   full
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              25600K
NUMA node0 CPU(s):     0-8,18-26
NUMA node1 CPU(s):     9-17,27-35

--
You received this message because you are subscribed to the Google Groups "golang-nuts" group.
To unsubscribe from this group and stop receiving emails from it, send an email to golang-nuts+unsubscribe@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.



--
Michael T. Jones
michae...@gmail.com

T L

unread,
Dec 15, 2016, 10:39:35 PM12/15/16
to golang-nuts, rd6...@gmail.com

Thanks for the guide, RD.

Here is the my system info

$ uname -a
Linux debian8 4.2.0-1-amd64 #1 SMP Debian 4.2.6-3 (2015-12-06) x86_64 GNU/Linux
$ go version
go version go1.7.4 linux/amd64
$ cat /proc/cpuinfo
processor    : 0
vendor_id    : GenuineIntel
cpu family    : 6
model        : 42
model name    : Intel(R) Core(TM) i3-2350M CPU @ 2.30GHz
stepping    : 7
microcode    : 0x29
cpu MHz        : 995.828
cache size    : 3072 KB
physical id    : 0
siblings    : 4
core id        : 0
cpu cores    : 2
apicid        : 0
initial apicid    : 0
fpu        : yes
fpu_exception    : yes
cpuid level    : 13
wp        : yes
flags        : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer xsave avx lahf_lm arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid xsaveopt
bugs        :
bogomips    : 4589.65
clflush size    : 64
cache_alignment    : 64
address sizes    : 36 bits physical, 48 bits virtual
power management:
...(3 more same ones)


Here is the result for MyInt==int

$ go test -bench=. -cpuprofile test.prof
testing: warning: no tests to run
BenchmarkLoop_2000000-4             1000       2026451 ns/op
BenchmarkMemclr_2000000-4           1000       2075557 ns/op
PASS
ok      _/tmp    4.546s
$ go tool pprof test.prof
Entering interactive mode (type "help" for commands)
(pprof) top 5
4.55s of 4.55s total (  100%)
Showing top 5 nodes out of 8 (cum >= 2.31s)

      flat  flat%   sum%        cum   cum%
     2.31s 50.77% 50.77%      2.31s 50.77%  runtime.memclr
     2.24s 49.23%   100%      2.24s 49.23%  _/tmp.memsetLoop
         0     0%   100%      2.24s 49.23%  _/tmp.BenchmarkLoop_2000000
         0     0%   100%      2.31s 50.77%  _/tmp.BenchmarkMemclr_2000000
         0     0%   100%      2.31s 50.77%  _/tmp.memclr
(pprof) disasm memsetLoop
Total: 4.55s
ROUTINE ======================== _/tmp.memsetLoop
     2.24s      2.24s (flat, cum) 49.23% of Total

         .          .     46d770: MOVQ 0x10(SP), AX
         .          .     46d775: MOVQ 0x8(SP), CX
         .          .     46d77a: MOVQ 0x20(SP), DX
         .          .     46d77f: XORL BX, BX
         .          .     46d781: CMPQ AX, BX
         .          .     46d784: JGE 0x46d792
         .          .     46d786: MOVQ DX, 0(CX)(BX*8)
     2.24s      2.24s     46d78a: INCQ BX
         .          .     46d78d: CMPQ AX, BX
         .          .     46d790: JL 0x46d786


And the result for MyInt=int32

$ go test -bench=. -cpuprofile test.prof
testing: warning: no tests to run
BenchmarkLoop_2000000-4             1000       1128167 ns/op
BenchmarkMemclr_2000000-4           2000       1031849 ns/op
PASS
ok      _/tmp    3.460s
$ go tool pprof test.prof
Entering interactive mode (type "help" for commands)
(pprof) top 5  
3.44s of 3.45s total (99.71%)
Dropped 10 nodes (cum <= 0.02s)
Showing top 5 nodes out of 8 (cum >= 2.18s)

      flat  flat%   sum%        cum   cum%
     2.18s 63.19% 63.19%      2.18s 63.19%  runtime.memclr
     1.26s 36.52% 99.71%      1.26s 36.52%  _/tmp.memsetLoop
         0     0% 99.71%      1.26s 36.52%  _/tmp.BenchmarkLoop_2000000
         0     0% 99.71%      2.18s 63.19%  _/tmp.BenchmarkMemclr_2000000
         0     0% 99.71%      2.18s 63.19%  _/tmp.memclr
(pprof)
(pprof) disasm memsetLoop
Total: 3.45s
ROUTINE ======================== _/tmp.memsetLoop
     1.26s      1.26s (flat, cum) 36.52% of Total

         .          .     46d770: MOVQ 0x10(SP), AX
         .          .     46d775: MOVQ 0x8(SP), CX
         .          .     46d77a: MOVL 0x20(SP), DX
         .          .     46d77e: XORL BX, BX
         .          .     46d780: CMPQ AX, BX
         .          .     46d783: JGE 0x46d790
         .          .     46d785: MOVL DX, 0(CX)(BX*4)
     1.25s      1.25s     46d788: INCQ BX
      10ms       10ms     46d78b: CMPQ AX, BX

Sokolov Yura

unread,
Dec 16, 2016, 12:05:39 AM12/16/16
to golang-nuts
Memory is slow. While slice fits to cache, memclr is measurably faster.
When slice doesn't fit cache, memclr at least not significantly faster.

I've heard, adaptive prefetching is turned on if there were 3 consequent accesses to same cache-line in increasing address order. So, perhaps optimised SSE/AVX zeroing doesn't trigger adaptive prefetch cause it uses less memory accesses. And then, it may vary much by CPU model: newer models may fix adaptive prefetch, so that memclr is great again.

Sokolov Yura

unread,
Dec 16, 2016, 12:13:06 AM12/16/16
to golang-nuts
I suppose, prefetch instructions in AVX loop (for block after current) can solve this issue.

rd6...@gmail.com

unread,
Dec 16, 2016, 8:30:12 AM12/16/16
to golang-nuts
Hi Solokov,

interesting idea, but it does not seem that the cache size would be the issue at hand here — please note that slices of more than 524288 integers do not fit in my laptop cache:

 1. The runtime is linear to the size of the array (no "cliff" — see attached picture)

 2. The page misses are ridiculously low (expected since the benchmark yields itself to contiguous allocation and processing):

 Performance counter stats for '/usr/local/go/bin/go test -bench=.':

      20882.464799      task-clock (msec)         #    0.998 CPUs utilized         
             4,947      context-switches          #    0.237 K/sec                 
               239      cpu-migrations            #    0.011 K/sec                 
            23,583      page-faults               #    0.001 M/sec                 
    64,142,963,898      cycles                    #    3.072 GHz                   
    37,246,686,615      instructions              #    0.58  insn per cycle        
     4,753,144,504      branches                  #  227.614 M/sec                 
         2,870,298      branch-misses             #    0.06% of all branches 

I am running the following excerpt from the original code:
package P

import (
   
"strconv"
   
"testing"
)


// memclr zeroes every element of a. The `for i := range a { a[i] = 0 }`
// shape is the exact pattern the compiler lowers to runtime.memclr.
// (Reconstructed from a whitespace-mangled paste; tokens unchanged.)
func memclr(a []int) {
    for i := range a {
        a[i] = 0
    }
}


func
BenchmarkMemclr(b *testing.B) {
   
for i := 100000; i < 409600000; i *= 2 {
        b
.Run("bench"+strconv.Itoa(i), func(b *testing.B) {
           
var a = make([]int, i)

            b
.ResetTimer()
           
for i := 0; i < b.N; i++ {
                memclr
(a)
           
}

       
})
   
}
}



running_time.png
Reply all
Reply to author
Forward
0 new messages