diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 7b9ad1e..0e25d0e 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1055,6 +1055,8 @@
// Actually do the allocation.
var x unsafe.Pointer
var elemsize uintptr
+
+ // TODO(thepudds): minor, but is it expected that noscan 32KiB (maxSmallSize) uses mallocgcLarge, not the small size path?
if size <= maxSmallSize-gc.MallocHeaderSize {
if typ == nil || !typ.Pointers() {
if size < maxTinySize {
@@ -1299,6 +1301,7 @@
var x unsafe.Pointer
// First, check for a reusable object.
+ // TODO(thepudds): could nextFreeFast, nextFree, and nextReusable return unsafe.Pointer? Maybe it doesn't matter; there's probably some reason for the current signatures.
if c.hasReusableNoscan(spc) {
// We have a reusable object, use it.
v, span = c.nextReusableNoScan(span, spc)
@@ -1413,11 +1416,64 @@
sizeclass := gc.SizeToSizeClass8[divRoundUp(size, gc.SmallSizeDiv)]
spc := makeSpanClass(sizeclass, false)
span := c.alloc[spc]
- v := nextFreeFast(span)
+
+ var v gclinkptr
+ var x unsafe.Pointer
+
+ // First, check for a potentially reusable object.
+ if c.hasReusableScan(spc) {
+ // We have special handling for small no-header scan objects because we
+ // currently do not have a mechanism to update the heap bits for an object
+ // stored in a span that is no longer in the mcache. (If we were to just
+ // alter the heap bits for a span outside the mcache via heapSetTypeNoHeader, then
+ // other goroutines could be concurrently altering the heap bits for adjacent objects,
+ // which could clash given the heap bits for multiple objects can be within a single word.)
+ //
+ // nextReusableScanNoHeader verifies that a stored reusable object is
+ // either in an mspan in the mcache (and hence we can update the heap bits), or
+ // has the same heap bits as the object we are trying to allocate (and hence
+ // we don't need to update the heap bits to reuse the object). It might therefore
+ // return 0 if there were stored reusable objects, but none that could be used.
+ // TODO(thepudds): verify the explanation above is correct.
+ var update bool
+ v, span, update = c.nextReusableScanNoHeader(span, spc, size, typ)
+ if v != 0 {
+ // We have a reusable object, use it.
+ x = unsafe.Pointer(v)
+
+ // This is a previously used object, so always clear (regardless of span.needzero).
+ memclrNoHeapPointers(x, size)
+
+ if update {
+ // TODO(thepudds): is this safe from a conservative scan given freeIndexForScan?
+ heapSetTypeNoHeader(uintptr(x), size, typ, span)
+ }
+
+ size = uintptr(gc.SizeClassToSize[sizeclass])
+
+ // Compensate for the GC assist credit deducted in mallocgc (before calling us and after we return)
+ // because this is not a newly allocated object. We use the full slot size (elemsize) here because
+ // that's what mallocgc deducts overall.
+ // TODO(thepudds): confirm / test that this is correct and the math works out, including when there's internal fragmentation.
+ addAssistCredit(size)
+
+ // See publicationBarrier comment below.
+ // TODO(thepudds): do we need this? The GC may have already observed this object via a prior incarnation.
+ // TODO(thepudds): alternatively, maybe move this up to just before heapSetTypeNoHeader?
+ publicationBarrier()
+
+ // Finish and return.
+ // Note that we do not update span.freeIndexForScan, profiling info,
+ // nor do we check gcTrigger. (checkGCTrigger is still false here.)
+ goto finish
+ }
+ }
+
+ v = nextFreeFast(span)
if v == 0 {
v, span, checkGCTrigger = c.nextFree(spc)
}
- x := unsafe.Pointer(v)
+ x = unsafe.Pointer(v)
if span.needzero != 0 {
memclrNoHeapPointers(x, size)
}
@@ -1438,24 +1494,6 @@
// but see uninitialized memory or stale heap bits.
publicationBarrier()
- if writeBarrier.enabled {
- // Allocate black during GC.
- // All slots hold nil so no scanning is needed.
- // This may be racing with GC so do it atomically if there can be
- // a race marking the bit.
- gcmarknewobject(span, uintptr(x))
- } else {
- // Track the last free index before the mark phase. This field
- // is only used by the garbage collector. During the mark phase
- // this is used by the conservative scanner to filter out objects
- // that are both free and recently-allocated. It's safe to do that
- // because we allocate-black if the GC is enabled. The conservative
- // scanner produces pointers out of thin air, so without additional
- // synchronization it might otherwise observe a partially-initialized
- // object, which could crash the program.
- span.freeIndexForScan = span.freeindex
- }
-
// Note cache c only valid while m acquired; see #47302
//
// N.B. Use the full size because that matches how the GC
@@ -1469,6 +1507,29 @@
if c.nextSample < 0 || MemProfileRate != c.memProfRate {
profilealloc(mp, x, size)
}
+
+ if !writeBarrier.enabled {
+ // Track the last free index before the mark phase. This field
+ // is only used by the garbage collector. During the mark phase
+ // this is used by the conservative scanner to filter out objects
+ // that are both free and recently-allocated. It's safe to do that
+ // because we allocate-black if the GC is enabled. The conservative
+ // scanner produces pointers out of thin air, so without additional
+ // synchronization it might otherwise observe a partially-initialized
+ // object, which could crash the program.
+ span.freeIndexForScan = span.freeindex
+ }
+
+finish:
+
+ if writeBarrier.enabled {
+ // Allocate black during GC.
+ // All slots hold nil so no scanning is needed.
+ // This may be racing with GC so do it atomically if there can be
+ // a race marking the bit.
+ gcmarknewobject(span, uintptr(x))
+ }
+
mp.mallocing = 0
releasem(mp)
@@ -1511,17 +1572,59 @@
size = uintptr(gc.SizeClassToSize[sizeclass])
spc := makeSpanClass(sizeclass, false)
span := c.alloc[spc]
- v := nextFreeFast(span)
+
+ var v gclinkptr
+ var x unsafe.Pointer
+
+ // First, check for a reusable object.
+ // TODO(thepudds): could nextFreeFast, nextFree, and nextReusable return unsafe.Pointer? Maybe it doesn't matter; there's probably some reason for the current signatures.
+ if c.hasReusableScan(spc) {
+ // We have a reusable object, use it.
+ v, span = c.nextReusableScanHeader(span, spc)
+ x = unsafe.Pointer(v)
+
+ // Compensate for the GC assist credit deducted in mallocgc (before calling us and after we return)
+ // because this is not a newly allocated object. We use the full slot size (elemsize) here because
+ // that's what mallocgc deducts overall.
+ // TODO(thepudds): confirm / test that the math works out, including that we're not off by 8, and also when there's internal fragmentation.
+ addAssistCredit(size)
+
+ // Object is being reused. Clear it, but leave the header alone so that the GC won't see a nil type.
+ // The header then gets set below to the new type in heapSetTypeSmallHeader.
+ // TODO: This might be a Bad Idea™. Normally, the GC would not have an
+ // in-flight observation of this object, but when reusing, it might,
+ // so if we don't leave the type alone here, the GC could see a nil type between
+ // the memclrNoHeapPointers call and the heapSetTypeSmallHeader call below
+ // and deref the nil or throw. (We saw a problem with typePointersOfUnchecked
+ // seeing a nil type, but other code could be unhappy too.)
+ // Alternatively, we could let the GC see the nil type and ignore it, though that
+ // might be worse in terms of sanity checking for other potential problems.
+ // TODO: add comment about conservative scanning.
+ memclrNoHeapPointers(add(x, mallocHeaderSize), size-mallocHeaderSize)
+
+ header := (**_type)(x)
+ x = add(x, mallocHeaderSize)
+ // TODO(thepudds): what to do about c.scanAlloc? Probably should not increment here?
+ heapSetTypeSmallHeader(uintptr(x), size-mallocHeaderSize, typ, header, span)
+
+ // See publicationBarrier comment below.
+ // TODO(thepudds): do we need this? The GC may have already observed this object this cycle via a prior incarnation?
+ publicationBarrier()
+
+ goto finish
+ }
+
+ v = nextFreeFast(span)
if v == 0 {
v, span, checkGCTrigger = c.nextFree(spc)
}
- x := unsafe.Pointer(v)
+ x = unsafe.Pointer(v)
if span.needzero != 0 {
memclrNoHeapPointers(x, size)
}
- header := (**_type)(x)
x = add(x, gc.MallocHeaderSize)
- c.scanAlloc += heapSetTypeSmallHeader(uintptr(x), size-gc.MallocHeaderSize, typ, header, span)
+ // TODO(thepudds): we don't use a header variable here for now, to avoid the gotos above jumping over its declaration.
+ c.scanAlloc += heapSetTypeSmallHeader(uintptr(x), size-gc.MallocHeaderSize, typ, (**_type)(unsafe.Pointer(uintptr(x)-gc.MallocHeaderSize)), span)
// Ensure that the stores above that initialize x to
// type-safe memory and set the heap bits occur before
@@ -1531,24 +1634,6 @@
// but see uninitialized memory or stale heap bits.
publicationBarrier()
- if writeBarrier.enabled {
- // Allocate black during GC.
- // All slots hold nil so no scanning is needed.
- // This may be racing with GC so do it atomically if there can be
- // a race marking the bit.
- gcmarknewobject(span, uintptr(x))
- } else {
- // Track the last free index before the mark phase. This field
- // is only used by the garbage collector. During the mark phase
- // this is used by the conservative scanner to filter out objects
- // that are both free and recently-allocated. It's safe to do that
- // because we allocate-black if the GC is enabled. The conservative
- // scanner produces pointers out of thin air, so without additional
- // synchronization it might otherwise observe a partially-initialized
- // object, which could crash the program.
- span.freeIndexForScan = span.freeindex
- }
-
// Note cache c only valid while m acquired; see #47302
//
// N.B. Use the full size because that matches how the GC
@@ -1562,6 +1647,29 @@
if c.nextSample < 0 || MemProfileRate != c.memProfRate {
profilealloc(mp, x, size)
}
+
+ if !writeBarrier.enabled {
+ // Track the last free index before the mark phase. This field
+ // is only used by the garbage collector. During the mark phase
+ // this is used by the conservative scanner to filter out objects
+ // that are both free and recently-allocated. It's safe to do that
+ // because we allocate-black if the GC is enabled. The conservative
+ // scanner produces pointers out of thin air, so without additional
+ // synchronization it might otherwise observe a partially-initialized
+ // object, which could crash the program.
+ span.freeIndexForScan = span.freeindex
+ }
+
+finish:
+
+ if writeBarrier.enabled {
+ // Allocate black during GC.
+ // All slots hold nil so no scanning is needed.
+ // This may be racing with GC so do it atomically if there can be
+ // a race marking the bit.
+ gcmarknewobject(span, uintptr(x))
+ }
+
mp.mallocing = 0
releasem(mp)
@@ -1796,6 +1904,15 @@
//
// It is a no-op if called on a stack pointer. It must not be called
// on a global variable in the data or bss sections, which is partially enforced.
+//
+// TODO: early measurements were that passing the size allowed
+// eliminating ~60% of the cost of a free operation. The
+// implementation has changed since then, but if that observation
+// survives being re-measured, it might make sense
+// to always require the size and noscan, and rename freeSized to free.
+// However, some potential stdlib callers (for example, slices.Collect, I think)
+// do not know whether the type has pointers, though a compiler-inserted
+// call could know.
func freeSized(p unsafe.Pointer, size uintptr, noscan bool) bool {
if !reusableSize(size) {
return false
@@ -1840,6 +1957,8 @@
}
if debug.clobberfree != 0 {
+ // TODO(thepudds): for now, we use the passed-in size. For a "real" version of clobberfree, we should probably use
+ // the proper size of the underlying object, and maybe throw if the passed-in size is outside legal bounds.
clobberfree(p, size)
}
@@ -1850,7 +1969,7 @@
throw("freeSized called without a P or outside bootstrapping")
}
- v := uintptr(p)
+ v := uintptr(p) // TODO: probably should be gclinkptr?
if !noscan && !heapBitsInSpan(size) {
// mallocgcSmallScanHeader expects to get the base address of the object back from findReusable
// (as well as from nextFreeFast and nextFree), and not mallocHeaderSize bytes into a object,
@@ -1858,6 +1977,7 @@
v -= mallocHeaderSize
// The size class lookup wants size to be adjusted by mallocHeaderSize.
+ // TODO: is that true?
size += mallocHeaderSize
}
@@ -1867,7 +1987,8 @@
} else {
sizeclass = gc.SizeToSizeClass128[divRoundUp(size-gc.SmallSizeMax, gc.LargeSizeDiv)]
}
-
+ // TODO: minor note: we currently don't do 'size = uintptr(gc.SizeClassToSize[sizeclass])', but we might want to do that
+ // if we rely more heavily on size in the future (or maybe for logging).
spc := makeSpanClass(sizeclass, noscan)
s := c.alloc[spc]
@@ -1882,7 +2003,7 @@
if noscan {
c.addReusableNoscan(spc, uintptr(v))
} else {
- // TODO(thepudds): no-op for now for scan. Implemented in later CL in stack.
+ c.addReusableScan(spc, uintptr(v))
}
// For stats, for now we leave allocCount alone, roughly pretending to the rest
@@ -1940,6 +2061,129 @@
return v, span
}
+// nextReusableScanHeader returns the next reusable object for a span with headers,
+// or 0 if no reusable object is found.
+//
+// It must only be called after hasReusableScan returned true.
+func (c *mcache) nextReusableScanHeader(s *mspan, spc spanClass) (gclinkptr, *mspan) {
+ if !enableReusable || c.reusableScan[spc] == 0 {
+ // TODO(thepudds): checking for zero is redundant here if we assume hasReusableScan returned true.
+ return 0, s
+ }
+
+ // Pop a reusable pointer.
+ rl := c.reusableLink(spc)
+ v := gclinkptr(rl.ptrs[rl.len-1])
+ rl.len--
+
+ if v == 0 {
+ throw("nextReusable: nil from ptr list")
+ }
+
+ if debugReusableLog {
+ println("reusing from ptr list:", hex(v), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled)
+ }
+ if doubleCheckReusable {
+ doubleCheckNextReusable(v) // debug only sanity check
+ }
+
+ if s.base() <= uintptr(v) && uintptr(v) < s.limit {
+ // The pointer is in a span in the mcache, so we can use it without looking up the span.
+ return v, s
+ }
+
+ // We need to find and return the span so that our caller can gcmarknewobject if needed, update heap bits, etc.
+ span := spanOf(uintptr(v))
+ if span == nil {
+ throw("nextReusable: nil span for pointer in reusablePtrs free list")
+ }
+
+ return v, span
+}
+
+// nextReusableScanNoHeader is like nextReusableScanHeader, but for noheader spans, and
+// for any reusable object that is outside of an mspan currently in mcache, it only returns
+// objects that can be reused without updating the heap bits. In other words, before returning
+// a reusable object outside the mcache, it verifies that the heap bits for
+// the object in its span are equivalent to what would be needed to allocate
+// an object with the requested size and typ.
+//
+// The update bool reports whether the heap bits should be updated.
+//
+// This must only be called after hasReusableScan returned true.
+//
+// TODO(thepudds): this is probably not exactly what we'd want to do for real, but this is
+// at least a concrete demonstration of avoiding the problem of not currently
+// being able to update heap bits for a reused object outside the mcache. For
+// a perfectly unfavorable app alloc/free pattern, this might essentially decay to
+// just reusing objects from an mspan currently in mcache. An alternative
+// would be to just always do that for noheader scan objects, which are smaller,
+// and hence have more objects per span -- 512 objects for 16-byte objects
+// through ~16 objects for the largest noheader objects. In other words,
+ // there is generally more to work with within the mcache for noheader scan objects,
+// so maybe it is a reasonable choice to restrict noheader objects to the mcache.
+func (c *mcache) nextReusableScanNoHeader(s *mspan, spc spanClass, size uintptr, typ *_type) (v gclinkptr, span *mspan, update bool) {
+ if !enableReusable || c.reusableScan[spc] == 0 {
+ // TODO(thepudds): checking for zero is redundant here if we assume hasReusableScan returned true.
+ return 0, s, false
+ }
+
+ rl := c.reusableLink(spc)
+
+ maxDiscard := 4
+ for rl.len > 0 && maxDiscard > 0 {
+ // Pop a reusable pointer, then validate if we can reuse.
+ // We can reuse if still in an mspan in the mcache (which means we can
+ // mutate the heap bits as needed) or if the heap bits match what we need.
+ // If not reusable, keep popping and discarding.
+ // TODO(thepudds): it's debatable how many we should discard before giving up, and whether
+ // it might make sense to scan more than we discard. There is a CPU cost for each one we examine.
+ // There can also be lost reuse opportunities if we throw too many away. For now, somewhat arbitrarily,
+ // we discard at most 4, or however many are in this link. The typical count examined in practice
+ // might be something like 1-2? (e.g., if there's a heap bit pattern that dominates this span class,
+ // or if it discards its way down to mostly staying within the mcache).
+ v = gclinkptr(rl.ptrs[rl.len-1])
+ rl.len--
+ maxDiscard--
+
+ if v == 0 {
+ throw("nextReusable: nil from ptr list")
+ }
+
+ if debugReusableLog {
+ println("reusing from ptr list:", hex(v), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled)
+ }
+ if doubleCheckReusable {
+ doubleCheckNextReusable(v) // debug only sanity check
+ }
+
+ if s.base() <= uintptr(v) && uintptr(v) < s.limit {
+ // The pointer is in a span in the mcache, so we can use that span without calling spanOf,
+ // but we need to update the heap bits.
+ return v, s, true
+ }
+
+ // We need to find and return the span so that our caller can gcmarknewobject if needed, update heap bits, etc.
+ span = spanOf(uintptr(v))
+ if span == nil {
+ throw("nextReusable: nil span for pointer in reusablePtrs free list")
+ }
+
+ // Confirm this reusable object can be used for this size and type without
+ // updating any heap bits. (See this function's doc comment).
+ if !span.heapBitsSmallMatchesType(uintptr(v), size, typ) {
+ continue
+ }
+
+ // We found a usable pointer, and we don't need to update the heap bits.
+ return v, span, false
+ }
+
+ // We popped all the pointers in this link (or hit maxDiscard), but none were usable.
+ // Return the original span.
+ return 0, s, false
+}
+
// doubleCheckNextReusable checks some invariants.
func doubleCheckNextReusable(v gclinkptr) {
// TODO(thepudds): will probably delete some of this. Can mostly be ignored for review.
@@ -2018,6 +2262,18 @@
return newobject(typ)
}
+//go:linkname maps_freeLive internal/runtime/maps.freeLive
+func maps_freeLive(p unsafe.Pointer) bool {
+ // return free(p)
+ return true
+}
+
+//go:linkname strings_freeLive strings.runtimefree
+func strings_freeLive(p unsafe.Pointer) bool {
+ // return free(p)
+ return true
+}
+
//go:linkname strings_freeSized strings.runtimefreesized
func strings_freeSized(p unsafe.Pointer, size uintptr, noscan bool) bool {
return freeSized(p, size, noscan)
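
(Aside, not part of the patch: the header adjustment in freeSized above, where we subtract mallocHeaderSize from the pointer and add it back onto the size before the size-class lookup, is easy to get backwards. The standalone sketch below walks through that arithmetic with a made-up address and a toy rounding function standing in for the real gc.SizeToSizeClass tables; whether a header is present at all depends on heapBitsInSpan, as in the code above.)

// A minimal sketch of freeSized's header adjustment for scan objects whose span
// uses malloc headers. The address, object size, and sizeToClass helper are
// illustrative assumptions, not runtime internals.
package main

import "fmt"

const mallocHeaderSize = 8 // one word on 64-bit

// sizeToClass stands in for the gc.SizeToSizeClass8/128 lookups; for this example
// it just rounds up to a multiple of 16.
func sizeToClass(size uintptr) uintptr {
	return (size + 15) &^ 15
}

func main() {
	// Suppose the caller frees a 1016-byte pointer-bearing object. Because the span
	// uses headers, the user-visible pointer sits 8 bytes past the slot base, and the
	// slot itself was sized for 1016+8 = 1024 bytes.
	userPtr := uintptr(0xc000100008)
	userSize := uintptr(1016)

	base := userPtr - mallocHeaderSize      // slot base handed to the reuse lists
	slotSize := userSize + mallocHeaderSize // size used for the size-class lookup

	fmt.Printf("slot base:  %#x\n", base)                       // 0xc000100000
	fmt.Printf("slot class: %d bytes\n", sizeToClass(slotSize)) // 1024
}
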
diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go
index 0972c70..22470ca 100644
--- a/src/runtime/malloc_test.go
+++ b/src/runtime/malloc_test.go
@@ -258,6 +258,22 @@
{"size=512", testFreeSized[[512]byte], true},
{"size=4096", testFreeSized[[4096]byte], true},
{"size=32KiB-8", testFreeSized[[1<<15 - 8]byte], true}, // max noscan small object for 64-bit
+
+ // Types with pointers, plus some additional size variation.
+ {"size=16", testFreeSized[[16 / PtrSize]*int], false},
+ {"size=24", testFreeSized[[24 / PtrSize]*int], false},
+ {"size=32", testFreeSized[[32 / PtrSize]*int], false},
+ {"size=64", testFreeSized[[64 / PtrSize]*int], false},
+ {"size=72", testFreeSized[[72 / PtrSize]*int], false},
+ {"size=200", testFreeSized[[200 / PtrSize]*int], false},
+ {"size=504", testFreeSized[[504 / PtrSize]*int], false},
+ {"size=512", testFreeSized[[512 / PtrSize]*int], false},
+ {"size=520", testFreeSized[[520 / PtrSize]*int], false},
+ {"size=4088", testFreeSized[[4088 / PtrSize]*int], false},
+ {"size=4096", testFreeSized[[4096 / PtrSize]*int], false},
+ {"size=4104", testFreeSized[[4104 / PtrSize]*int], false},
+ {"size=14336", testFreeSized[[14336 / PtrSize]*int], false}, // size class just below 16 KiB (ignoring header)
+ {"size=32KiB-8", testFreeSized[[(1<<15 - 8) / PtrSize]*int], false}, // max scan small object for 64-bit
}
// Run the tests twice.
@@ -401,6 +417,104 @@
}
})
+ t.Run("free-alternating-types", func(t *testing.T) {
+ // Confirm we can allocate and free elements in the same size class
+ // but with pointers in different locations while getting the
+ // expected allocation counts.
+ var tZero T
+ tBytes := unsafe.Sizeof(tZero)
+ ptrBytes := unsafe.Sizeof(uintptr(0))
+
+ // Don't bother testing noscan, or odd numbers of pointers in T.
+ // (An odd count of pointers in T would need to be rounded for T2, but
+ // the T2 slice might end up in a different size class, depending on the boundary.)
+ if noscan || (tBytes/ptrBytes)%2 != 0 {
+ return
+ }
+
+ type T2 struct {
+ u uint
+ p *int
+ }
+ allocT2 := func() []T2 {
+ s := make([]T2, (tBytes/ptrBytes)/2)
+ for i := range s {
+ s[i].u = uint(ClobberdeadPtr) // perhaps increase odds of catching pointer confusion
+ }
+ return s
+ }
+ t2Bytes := uintptr(len(allocT2())) * unsafe.Sizeof(T2{})
+
+ type T3 struct {
+ p1 *int
+ p2 *int
+ }
+ allocT3 := func() []T3 {
+ s := make([]T3, (tBytes/ptrBytes)/2)
+ return s
+ }
+ t3Bytes := uintptr(len(allocT3())) * unsafe.Sizeof(T3{})
+
+ if tBytes != t2Bytes || t2Bytes != t3Bytes {
+ t.Fatalf("TestFreeLive: size mismatch: T %d bytes, T2 %d bytes, T3 %d bytes", tBytes, t2Bytes, t3Bytes)
+ }
+
+ // First, test alternating with a T2 slice (which mixes pointer and non-pointer fields).
+ const maxOutstanding = 10
+ s1 := make([]*T, 0, maxOutstanding)
+ s2 := make([][]T2, 0, maxOutstanding)
+ allocs := testing.AllocsPerRun(1000, func() {
+ s1 = s1[:0]
+ s2 = s2[:0]
+ for range maxOutstanding {
+ p1 := alloc()
+ s1 = append(s1, p1)
+ p2 := allocT2()
+ s2 = append(s2, p2)
+ }
+ for i := range s1 {
+ free(s1[i])
+ runtime.FreeSized(unsafe.Pointer(unsafe.SliceData(s2[i])), t2Bytes, false) // !noscan
+ }
+ })
+ t.Logf("TestFreeLive: free-alternating-types: T: %d bytes, T2: %d bytes, allocs %v", tBytes, t2Bytes, allocs)
+
+ if allocs > 1 {
+ // To reuse the most memory, we would free in the opposite order of what we do just above,
+ // but we don't free in that order to exercise things a bit more.
+ // In the current order, the last alloc was for a T2 slice, so the
+ // first free of a T1 pointer of size <= 512 bytes can be a mismatch on heap bits,
+ // which means one reusable object might be discarded at the start of the free loop above.
+ // This is why we can get 1 alloc here, though can also get 0 if the reusable objects
+ // are within an mspan still in the mcache.
+ t.Fatalf("expected 0 or 1 allocations, got %v", allocs)
+ }
+
+ // Second, test alternating with a T3 slice (which is all pointers, and
+ // hence is recognized as having the same heap bits as T, which is an array).
+ s3 := make([][]T3, 0, maxOutstanding)
+ allocs = testing.AllocsPerRun(1000, func() {
+ s1 = s1[:0]
+ s3 = s3[:0]
+ for range maxOutstanding {
+ p1 := alloc()
+ s1 = append(s1, p1)
+ p3 := allocT3()
+ s3 = append(s3, p3)
+ }
+ for i := range s1 {
+ free(s1[i])
+ runtime.FreeSized(unsafe.Pointer(unsafe.SliceData(s3[i])), t3Bytes, false) // !noscan
+ }
+ })
+ t.Logf("TestFreeLive: free-alternating-types: T: %d bytes, T3: %d bytes, allocs %v", tBytes, t3Bytes, allocs)
+
+ if allocs != 0 {
+ // The all-pointer slice should allow zero allocs.
+ t.Fatalf("expected 0 allocations, got %v", allocs)
+ }
+ })
+
t.Run("duplicate-check", func(t *testing.T) {
// A simple duplicate allocation test. We track what should be the set
// of live pointers in a map across a series of allocs and frees,
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 4625d29..9b69446 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -154,6 +154,27 @@
// Pull the allocation header from the first word of the object.
typ = *(**_type)(unsafe.Pointer(addr))
addr += gc.MallocHeaderSize
+ if typ == nil {
+ // TODO: prior to adjusting mallocgcSmallScanHeader to leave the
+ // header alone when zeroing a reused object, we could get a nil typ here.
+ // See related comment in mallocgcSmallScanHeader.
+ //
+ // Example prior failure:
+ // === RUN TestFreeSized/ptrs=true/size=27264
+ // typePointersOfUnchecked: typ is nil
+ // addr=0xc0004de008 base=0xc0004de000
+ // span.base()=0xc0004de000 span.limit=0xc0004ec000 span.state=mSpanInUse
+ // span.elemsize=28672 span.nelems=2
+ // span.spanclass=132 noscan=false
+ // span.allocCount=2 nalloc=2 nfreed=0
+ // span.sweepgen=5508 mheap.sweepgen=5508
+ // fatal error: typePointersOfUnchecked: typ is nil
+ //
+ println("typePointersOfUnchecked: typ is nil")
+ print(" addr=", hex(addr), " objBase=", hex(span.objBase(addr)), "\n")
+ dumpSpan(span)
+ throw("typePointersOfUnchecked: typ is nil")
+ }
} else {
// Synchronize with allocator, in case this came from the conservative scanner.
// See heapSetTypeLarge for more details.
@@ -683,6 +704,55 @@
return
}
+// heapBitsSmallMatchesType reports whether the current heap bits stored at the end of
+// the span for a small object match what would be the heap bits for an object of type typ.
+// It has similar requirements to writeHeapBitsSmall, but just returns a bool without
+// updating anything.
+func (span *mspan) heapBitsSmallMatchesType(x, dataSize uintptr, typ *_type) bool {
+ // This is based on writeHeapBitsSmall.
+
+ // TODO(thepudds): if we keep this, we could probably make a fail-fast check up front
+ // (e.g., maybe get the spanBits, mask based on typ.Size_, and compare to typBits0
+ // prior to doing the work of repeating the bits. For a slice, this would compare the first few bits,
+ // and if they don't match, there's no need to repeat the bits. If they do match, do the bit-repeating work.
+ // Could even check the last few bits in parallel.)
+ // TODO(thepudds): to reduce duplicate work when this is called in a loop, we could return
+ // the final constructed typBits so the caller can pass it back in, avoiding reconstructing typBits each time,
+ // or maybe it's simpler/better to just split this into ~2 functions to better split the work in a loop,
+ // which might also allow us to call an inlinable shared helper from writeHeapBitsSmall.
+ // (Constructing the typBits below has an inlining cost of ~42.)
+
+ // First, get the live heap bits stored in the span for the object at x.
+ spanBits := span.heapBitsSmallForAddr(x)
+
+ // Second, create the heap bits that correspond to typ.
+ // This always fits in a single uintptr.
+ typBits0 := readUintptr(getGCMask(typ))
+
+ // Create repetitions of the bitmap if we have a small slice backing store.
+ typBits := typBits0
+ if typ.Size_ == goarch.PtrSize {
+ typBits = (1 << (dataSize / goarch.PtrSize)) - 1
+ } else {
+ // N.B. We rely on dataSize being an exact multiple of the type size.
+ // The alternative is to be defensive and mask typBits down to the length
+ // of dataSize. The purpose is to save on one additional masking operation.
+ if doubleCheckHeapSetType && !asanenabled && dataSize%typ.Size_ != 0 {
+ throw("runtime: (*mspan).heapBitsSmallMatchesType: dataSize is not a multiple of typ.Size_")
+ }
+ for i := typ.Size_; i < dataSize; i += typ.Size_ {
+ typBits |= typBits0 << (i / goarch.PtrSize)
+ }
+ if asanenabled {
+ // Mask typBits down to dataSize. dataSize is going to be a strange size because of
+ // the redzone required for allocations when asan is enabled.
+ typBits &= (1 << (dataSize / goarch.PtrSize)) - 1
+ }
+ }
+
+ return spanBits == typBits
+}
+
// heapSetType* functions record that the new allocation [x, x+size)
// holds in [x, x+dataSize) one or more values of type typ.
// (The number of values is given by dataSize / typ.Size.)
@@ -1001,6 +1071,18 @@
println()
}
+// TODO(thepudds): does something like this already exist?
+func dumpSpan(span *mspan) {
+ state := span.state.get()
+ print(" span.base()=", hex(span.base()), " span.limit=", hex(span.limit), " span.state=", mSpanStateNames[state], "\n")
+ print(" span.elemsize=", span.elemsize, " span.nelems=", span.nelems, "\n")
+ print(" span.spanclass=", span.spanclass, " noscan=", span.spanclass.noscan(), "\n")
+ nalloc := uint16(span.countAlloc())
+ nfreed := span.allocCount - nalloc
+ print(" span.allocCount=", span.allocCount, " nalloc=", nalloc, " nfreed=", nfreed, "\n")
+ print(" span.sweepgen=", span.sweepgen, " mheap.sweepgen=", mheap_.sweepgen, "\n")
+}
+
// addb returns the byte pointer p+n.
//
//go:nowritebarrier
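
(Aside, not part of the patch: heapBitsSmallMatchesType above reduces to one word of bit arithmetic. The self-contained sketch below replicates a type's one-word pointer bitmap across a small backing store the same way the repetition loop does, then compares it against a previously recorded bitmap; the bitmaps and sizes are illustrative assumptions, and the real code reads them via heapBitsSmallForAddr and getGCMask.)

// A minimal sketch of the "reusable without rewriting heap bits" check: build the
// pointer bitmap an allocation of the requested type would need and compare it to
// the bitmap already stored for the slot. One bit per word; bit 0 is the first word.
package main

import "fmt"

const ptrSize = 8 // assume 64-bit

// repeatTypeBits mirrors the replication loop in heapBitsSmallMatchesType (and
// writeHeapBitsSmall): typBits0 describes a single value of the type, and the result
// describes dataSize bytes of a backing store filled with such values.
func repeatTypeBits(typBits0 uint64, typeSize, dataSize uintptr) uint64 {
	if typeSize == ptrSize {
		// Single-word elements (e.g. []*T): every word is a pointer.
		return (1 << (dataSize / ptrSize)) - 1
	}
	typBits := typBits0
	for i := typeSize; i < dataSize; i += typeSize {
		typBits |= typBits0 << (i / ptrSize)
	}
	return typBits
}

func main() {
	// A 16-byte struct { u uint; p *T } has bitmap 0b10 (pointer only in word 1).
	// A 64-byte backing store holds four of them.
	typBits := repeatTypeBits(0b10, 16, 64)

	// Suppose the bits previously recorded for this slot were for []*T (all pointers),
	// like the T vs. T2 case in the malloc_test.go changes above.
	spanBits := uint64(1<<(64/ptrSize)) - 1

	fmt.Printf("typBits  = %08b\n", typBits)  // 10101010
	fmt.Printf("spanBits = %08b\n", spanBits) // 11111111
	fmt.Println("reusable without rewriting heap bits:", spanBits == typBits)
}
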
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 03769d0..1ce1029 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -50,13 +50,21 @@
// TODO(thepudds): better to interleave alloc and reusableScan/reusableNoscan so that
// a single malloc call can often access both in the same cache line for a given spanClass. It's not
// interleaved right now in part to have slightly smaller diff, and might be negligible effect on current microbenchmarks.
+ // TODO(thepudds): it is currently very wasteful to have numSpanClasses entries for each of reusableScan and reusableNoscan. Could
+ // just cast as needed instead of having two arrays, or could use two arrays of size _NumSizeClasses, or ___.
// reusableNoscan contains linked lists of reusable noscan heap objects, indexed by spanClass.
// The next pointers are stored in the first word of the heap objects.
reusableNoscan [numSpanClasses]gclinkptr
+ // reusableScan contains reusableLinks forming chunked linked lists of reusable scan objects,
+ // indexed by spanClass.
+ reusableScan [numSpanClasses]uintptr
+
stackcache [_NumStackOrders]stackfreelist
+ reusableLinkCache uintptr // free list of *reusableLink. (We don't use fixalloc.free).
+
// flushGen indicates the sweepgen during which this mcache
// was last flushed. If flushGen != mheap_.sweepgen, the spans
// in this mcache are stale and need to the flushed so they
@@ -108,6 +116,9 @@
// Clear reusable slots.
// TODO(thepudds): is this needed? Probably not, probably allocated zeroed, but confirm.
+ for i := range c.reusableScan {
+ c.reusableScan[i] = 0
+ }
for i := range c.reusableNoscan {
c.reusableNoscan[i] = 0
}
@@ -130,7 +141,6 @@
// with the stealing of gcworkbufs during garbage collection to avoid
// a race where the workbuf is double-freed.
// gcworkbuffree(c.gcworkbuf)
-
lock(&mheap_.lock)
mheap_.cachealloc.free(unsafe.Pointer(c))
unlock(&mheap_.lock)
@@ -163,6 +173,8 @@
// Must run in a non-preemptible context since otherwise the owner of
// c could change.
func (c *mcache) refill(spc spanClass) {
+ // println("refill", spc, "c", c, "mheap_.sweepgen", mheap_.sweepgen)
+
// Return the current cached span to the central lists.
s := c.alloc[spc]
@@ -171,10 +183,26 @@
}
// TODO(thepudds): we might be able to allow mallocgcTiny to reuse 16 byte objects from spc==5,
- // but for now, just clear our reusable objects for tinySpanClass.
- if spc == tinySpanClass {
+ // but for now, just clear our reusable objects for tinySpanClass. Similarly, we might have
+ // reusable pointers on a noheader scan span if we left reusable objects that did not have
+ // matching heap bits and then needed to refill on the normal allocation path.
+ // TODO(thepudds): this passes tests, but might be better to just allow the refill to happen
+ // without complaining, rather than clearing. Also, we currently only partially clear (not clearing next),
+ // and only partially check whether to complain (not checking next), which means we are effectively
+ // already allowing refills to proceed with non-empty reusable links.
+ if spc == tinySpanClass || (!spc.noscan() && heapBitsInSpan(s.elemsize)) {
+ if c.reusableScan[spc] != 0 {
+ rl := (*reusableLink)(unsafe.Pointer(c.reusableScan[spc]))
+ rl.len = 0
+ }
c.reusableNoscan[spc] = 0
}
+ if c.reusableScan[spc] != 0 {
+ rl := (*reusableLink)(unsafe.Pointer(c.reusableScan[spc]))
+ if rl.len > 0 {
+ throw("refill of span with reusable pointers remaining on pointer slice")
+ }
+ }
if c.reusableNoscan[spc] != 0 {
throw("refill of span with reusable pointers remaining on pointer free list")
}
@@ -344,6 +372,24 @@
// so we can simply clear the linked list head pointers.
c.reusableNoscan[i] = 0
}
+ for i := range c.reusableScan {
+ // For scan objects, we have chunked linked lists. We save the list nodes in c.reusableLinkCache.
+ // Note: currently the max count of list nodes in each list is quite small.
+ // TODO(thepudds): could consider tracking tail pointers if we end up with longer list lengths,
+ // or maybe we end up with a completely different storage scheme.
+ if c.reusableScan[i] != 0 {
+ // Prepend the list in c.reusableScan[i] to c.reusableLinkCache.
+ // First, find the tail. Note we don't clear or reset each link,
+ // and instead do that when pulling from the cache.
+ rl := (*reusableLink)(unsafe.Pointer(c.reusableScan[i]))
+ for rl.next != 0 {
+ rl = (*reusableLink)(unsafe.Pointer(rl.next))
+ }
+ rl.next = c.reusableLinkCache
+ c.reusableLinkCache = c.reusableScan[i]
+ c.reusableScan[i] = 0
+ }
+ }
// Update heapLive and heapScan.
gcController.update(dHeapLive, scanAlloc)
@@ -373,6 +419,112 @@
c.flushGen.Store(mheap_.sweepgen) // Synchronizes with gcStart
}
+// TODO(thepudds): for tracking the set of reusable objects, we currently have two parallel approaches:
+//
+// 1. For noscan objects: a linked list of heap objects, with the next pointers stored
+// in the first word of the formerly live heap objects.
+//
+// 2. For scan objects: a chunked linked list, with N pointers to heap objects per chunk
+// and the chunks linked together.
+//
+// The first approach is simpler and likely performs better. Importantly, it also uses
+// zero extra memory, and we do not need any cap on how many noscan objects we track.
+//
+// However, we don't use the first approach for scan objects because a linked list for
+// an mspan with types that mostly have user pointers in the first slot
+// might have a performance hiccup where the GC could examine one of the objects
+// in the free list (for example via a queued pointer during mark, or maybe an unlucky conservative scan),
+// and then the GC might then walk and mark the ~complete list of objects
+// in the free list by following the next pointers from that (otherwise dead) object,
+// while thinking the next pointers are valid user pointers.
+//
+// One alternative I might try for scan objects is storing the next pointer at the end of the allocation slot
+// when there is room, which is likely often the case, especially once you get past the smallest size classes.
+// For example, if the user allocates a 1024-byte user object, that ends up in the 1152-byte size class
+// (due to the existing type header) with ~120 bytes otherwise unused at the end of the allocation slot.
+// A drawback of this is it is not a complete solution because some heap objects do not have spare space,
+// so we might still need a chunked list or similar. (We could bump up a size class when needed,
+// but that is probably too wasteful. Separately, instead of a footer, when there is naturally
+// enough spare space, we could consider sliding the start of the user object an extra 8 bytes
+// so that the runtime can then use the second word of the allocation slot for the next pointer,
+// which might have better performance than a footer, but might be more complex).
+//
+// One idea from Michael K. is possibly setting the type header to nil for scan objects > 512 bytes,
+// and then using the second word of the heap object for the next pointer. If done properly, the nil
+// type header could mean the GC does not interpret the next pointer value as a user pointer.
+// The GC will currently throw if it finds an object with a nil type header, though
+// we could likely make it not complain about that. If we took this approach, we would still need
+// a separate solution for <= 512 byte scan objects.
+
+// reusableLink is a node in a chunked linked list of reusable objects,
+// used for scan objects.
+type reusableLink struct {
+ // TODO(thepudds): should this include sys.NotInHeap?
+
+ // next points to the next reusableLink in the list.
+ next uintptr
+ // len is the number of reusable objects in this link.
+ len int32
+ // rest is the number of links in the list following this one,
+ // used to track max number of links.
+ rest int32
+ // ptrs contains pointers to reusable objects.
+ // The pointer is to the base of the heap object.
+ // TODO(thepudds): maybe aim for 64 bytes or 128 bytes (2 cache lines if aligned).
+ ptrs [6]uintptr
+}
+
+func (c *mcache) allocReusableLink() uintptr {
+ var rl *reusableLink
+ if c.reusableLinkCache != 0 {
+ // Pop from the free list of reusableLinks.
+ rl = (*reusableLink)(unsafe.Pointer(c.reusableLinkCache))
+ c.reusableLinkCache = rl.next
+ } else {
+ systemstack(func() {
+ lock(&mheap_.reusableLinkLock)
+ rl = (*reusableLink)(mheap_.reusableLinkAlloc.alloc())
+ unlock(&mheap_.reusableLinkLock)
+ })
+ }
+ // TODO(thepudds): do a general review -- double-check that we conservatively clear next pointers (e.g., after popping).
+ rl.next = 0
+ rl.len = 0
+ rl.rest = 0
+ return uintptr(unsafe.Pointer(rl))
+}
+
+// addReusableScan adds an object pointer to a chunked linked list of reusable pointers
+// for a span class.
+func (c *mcache) addReusableScan(spc spanClass, ptr uintptr) {
+ if !enableReusable {
+ return
+ }
+
+ rl := (*reusableLink)(unsafe.Pointer(c.reusableScan[spc]))
+ if rl == nil || rl.len == int32(len(rl.ptrs)) {
+ var linkCount int32
+ if rl != nil {
+ linkCount = rl.rest + 1
+ if linkCount == 4 {
+ // Currently allow at most 4 links per span class.
+ // We are full, so drop this reusable ptr.
+ // TODO(thepudds): pick a max. Maybe a max per mcache instead or in addition to a max per span class?
+ return
+ }
+ }
+ // Prepend a new link.
+ new := (*reusableLink)(unsafe.Pointer(c.allocReusableLink()))
+ new.next = uintptr(unsafe.Pointer(rl))
+ c.reusableScan[spc] = uintptr(unsafe.Pointer(new))
+
+ new.rest = linkCount
+ rl = new
+ }
+ rl.ptrs[rl.len] = ptr
+ rl.len++
+}
+
// addReusableNoscan adds a noscan object pointer to the reusable pointer free list
// for a span class.
func (c *mcache) addReusableNoscan(spc spanClass, ptr uintptr) {
@@ -386,6 +538,16 @@
c.reusableNoscan[spc] = v
}
+// hasReusableScan reports whether there is a reusable object available for
+// a scan spc.
+func (c *mcache) hasReusableScan(spc spanClass) bool {
+ if !enableReusable || c.reusableScan[spc] == 0 {
+ return false
+ }
+ rl := (*reusableLink)(unsafe.Pointer(c.reusableScan[spc]))
+ return rl.len > 0 || rl.next != 0
+}
+
// hasReusableNoscan reports whether there is a reusable object available for
// a noscan spc.
func (c *mcache) hasReusableNoscan(spc spanClass) bool {
@@ -394,3 +556,28 @@
}
return c.reusableNoscan[spc] != 0
}
+
+// reusableLink returns a reusableLink containing reusable pointers for the span class.
+// It must only be called after hasReusableScan has returned true.
+func (c *mcache) reusableLink(spc spanClass) *reusableLink {
+ rl := (*reusableLink)(unsafe.Pointer(c.reusableScan[spc]))
+ if rl.len == 0 {
+ if rl.next == 0 {
+ throw("nonEmptyReusableLink: no reusableLink with reusable objects")
+ }
+
+ // Make the next link head of the list.
+ next := (*reusableLink)(unsafe.Pointer(rl.next))
+ c.reusableScan[spc] = uintptr(unsafe.Pointer(next))
+ if next.len != int32(len(next.ptrs)) {
+ throw("nextReusableScan: next reusable link is not full")
+ }
+
+ // Prepend the empty link to reusableLinkCache.
+ rl.next = c.reusableLinkCache
+ c.reusableLinkCache = uintptr(unsafe.Pointer(rl))
+
+ rl = next
+ }
+ return rl
+}
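
(Aside, not part of the patch: the chunked list manipulation in addReusableScan, reusableLink, and allocReusableLink is spread across several functions, so here is a self-contained sketch using ordinary Go pointers instead of uintptrs and fixalloc. The chunk size and link cap match the constants in the diff; the pop path is simplified (it does not recycle empty links or insist that the next link be full), and everything else is an illustrative assumption.)

// A minimal sketch of the per-span-class chunked linked list used for reusable
// scan objects: push prepends a new link when the head is full and drops the
// pointer once the link budget is exhausted; pop drains the head link LIFO.
package main

import "fmt"

const (
	ptrsPerLink = 6 // matches len(reusableLink.ptrs)
	maxLinks    = 4 // matches the per-span-class cap in addReusableScan
)

type link struct {
	next *link
	len  int
	rest int // number of links after this one
	ptrs [ptrsPerLink]uintptr
}

type list struct{ head *link }

// push records a reusable pointer, reporting whether it was kept.
func (l *list) push(p uintptr) bool {
	h := l.head
	if h == nil || h.len == ptrsPerLink {
		rest := 0
		if h != nil {
			rest = h.rest + 1
			if rest == maxLinks {
				return false // at the cap: drop this reusable pointer
			}
		}
		h = &link{next: l.head, rest: rest}
		l.head = h
	}
	h.ptrs[h.len] = p
	h.len++
	return true
}

// pop removes and returns the most recently kept pointer, advancing past an
// empty head link if needed.
func (l *list) pop() (uintptr, bool) {
	h := l.head
	if h != nil && h.len == 0 {
		h = h.next
		l.head = h
	}
	if h == nil || h.len == 0 {
		return 0, false
	}
	h.len--
	return h.ptrs[h.len], true
}

func main() {
	var l list
	kept := 0
	for i := uintptr(1); i <= 30; i++ {
		if l.push(i * 16) {
			kept++
		}
	}
	fmt.Println("kept:", kept, "of 30") // at most ptrsPerLink*maxLinks = 24
	v, ok := l.pop()
	fmt.Println("first pop:", v, ok) // LIFO: the last pointer that was kept
}
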
diff --git a/src/runtime/mfixalloc.go b/src/runtime/mfixalloc.go
index be977af..536af8f 100644
--- a/src/runtime/mfixalloc.go
+++ b/src/runtime/mfixalloc.go
@@ -55,6 +55,7 @@
// using the allocator to obtain chunks of memory.
func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *sysMemStat) {
if size > _FixAllocChunk {
+ println("runtime: fixalloc of size", size)
throw("runtime: fixalloc size too large")
}
size = max(size, unsafe.Sizeof(mlink{}))
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index ddbb57a..2d8ee75 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -213,8 +213,13 @@
pad [(cpu.CacheLinePadSize - unsafe.Sizeof(mcentral{})%cpu.CacheLinePadSize) % cpu.CacheLinePadSize]byte
}
+ // TODO(thepudds): probably need to set up lock rank for reusableLinkLock, or maybe simpler
+ // to use an existing lock like mheap_.lock.
+
spanalloc fixalloc // allocator for span*
cachealloc fixalloc // allocator for mcache*
+ reusableLinkAlloc fixalloc // allocator for reusableLink*
+ reusableLinkLock mutex // lock for reusableLink allocator
specialfinalizeralloc fixalloc // allocator for specialfinalizer*
specialCleanupAlloc fixalloc // allocator for specialCleanup*
specialCheckFinalizerAlloc fixalloc // allocator for specialCheckFinalizer*
@@ -768,6 +773,9 @@
// heapArenaOf returns the heap arena for p, if one exists.
func heapArenaOf(p uintptr) *heapArena {
+ // TODO(thepudds): question: does this always return nil if p is not heap allocated,
+ // or, like spanOf, might it return a non-nil *heapArena for a non-heap p? (Could
+ // possibly use this as a slightly cheaper sanity check, maybe for globals.)
ri := arenaIndex(p)
if arenaL1Bits == 0 {
// If there's no L1, then ri.l1() can't be out of bounds but ri.l2() can.
@@ -794,6 +802,7 @@
h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
+ h.reusableLinkAlloc.init(unsafe.Sizeof(reusableLink{}), nil, nil, &memstats.mcache_sys)
h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
h.specialCleanupAlloc.init(unsafe.Sizeof(specialCleanup{}), nil, nil, &memstats.other_sys)
h.specialCheckFinalizerAlloc.init(unsafe.Sizeof(specialCheckFinalizer{}), nil, nil, &memstats.other_sys)