Gerrit Bot (Gerrit)

unread,

Jul 16, 2025, 8:18:25 AMJul 16

to goph...@pubsubhelper.golang.org, Georgian-Vlad Saioc, golang-co...@googlegroups.com

Gerrit Bot has uploaded the change for review

Commit message

Proposal #74609: goroutine leak detection by using the garbage collector

runtime: detect goroutine leaks via the `DetectDeadlocks` API

Proposal #74609

Change-Id: I97a754b128aac1bc5b7b9ab607fcd5bb390058c8

GitHub-Last-Rev: 352dc0d30e58e8e09a25e357c2531ffe59b6aef6

GitHub-Pull-Request: golang/go#74622

Change diff

diff --git a/src/internal/goexperiment/exp_deadlockgc_off.go b/src/internal/goexperiment/exp_deadlockgc_off.go
new file mode 100644
index 0000000..185171f
--- /dev/null
+++ b/src/internal/goexperiment/exp_deadlockgc_off.go
@@ -0,0 +1,8 @@
+// Code generated by mkconsts.go. DO NOT EDIT.
+
+//go:build !goexperiment.deadlockgc
+
+package goexperiment
+
+const DeadlockGC = false
+const DeadlockGCInt = 0
diff --git a/src/internal/goexperiment/exp_deadlockgc_on.go b/src/internal/goexperiment/exp_deadlockgc_on.go
new file mode 100644
index 0000000..5c2b07c
--- /dev/null
+++ b/src/internal/goexperiment/exp_deadlockgc_on.go
@@ -0,0 +1,8 @@
+// Code generated by mkconsts.go. DO NOT EDIT.
+
+//go:build goexperiment.deadlockgc
+
+package goexperiment
+
+const DeadlockGC = true
+const DeadlockGCInt = 1
diff --git a/src/internal/goexperiment/flags.go b/src/internal/goexperiment/flags.go
index 63a3388..969f975 100644
--- a/src/internal/goexperiment/flags.go
+++ b/src/internal/goexperiment/flags.go
@@ -129,4 +129,7 @@
 
 	// GreenTeaGC enables the Green Tea GC implementation.
 	GreenTeaGC bool
+
+	// DeadlockGC enables the Deadlock GC implementation.
+	DeadlockGC bool
 }
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index 9a4611e..14028db 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -1208,7 +1208,7 @@
 	s.releasetime = 0
 	s.acquiretime = 0
 	s.ticket = 0
-	t.semTable.rootFor(addr).queue(addr, s, false)
+	t.semTable.rootFor(addr).queue(addr, s, false, false)
 }
 
 // Dequeue simulates dequeuing a waiter for a semaphore (or lock) at addr.
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index f2df1a0..b7105aa 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -373,17 +373,32 @@
 
 	// Number of roots of various root types. Set by gcPrepareMarkRoots.
 	//
-	// nStackRoots == len(stackRoots), but we have nStackRoots for
-	// consistency.
-	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
+	// During normal GC cycle, nStackRoots == nLiveStackRoots == len(stackRoots)
+	// during deadlock detection GC, nLiveStackRoots is the number of stackRoots
+	// to examine, and nStackRoots == len(stackRoots), which include goroutines that are
+	// unmarked / not runnable
+	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots, nLiveStackRoots int
+
+	// The GC has performed deadlock detection during this GC cycle.
+	detectedDeadlocks bool
+
+	// Is set to true by DetectDeadlocks(), instructing the next GC cycle to perform deadlock detection.
+	pendingDeadlockDetection bool
+
+	// When set, the GC is running in deadlock detection mode.
+	// This can be triggered with a runtime flag.
+	deadlockDetectionMode bool
 
 	// Base indexes of each root type. Set by gcPrepareMarkRoots.
 	baseData, baseBSS, baseSpans, baseStacks, baseEnd uint32
 
-	// stackRoots is a snapshot of all of the Gs that existed
-	// before the beginning of concurrent marking. The backing
-	// store of this must not be modified because it might be
-	// shared with allgs.
+	// stackRoots is a snapshot of all of the Gs that existed before the
+	// beginning of concurrent marking.  During deadlock detection GC, stackRoots
+	// is partitioned into two sets; to the left of nLiveStackRoots are stackRoots
+	// of running / runnable goroutines and to the right of nLiveStackRoots are
+	// stackRoots of unmarked / not runnable goroutines
+	// gcDiscoverMoreStackRoots modifies the stackRoots array to redo the partition
+	// after each marking phase
 	stackRoots []*g
 
 	// Each type of GC state transition is protected by a lock.
@@ -550,6 +565,29 @@
 	releasem(mp)
 }
 
+// DetectDeadlocks instructs the Go garbage collector to attempt
+// partial deadlock detection.
+//
+// Only operates if deadlockgc is enabled in GOEXPERIMENT.
+// Otherwise, it just runs runtime.GC().
+func DetectDeadlocks() {
+	if !goexperiment.DeadlockGC {
+		GC()
+		return
+	}
+
+	// This write should be thread-safe, as the overwritten value is true.
+	// pendingDeadlockDetection is only set to false under STW at the start
+	// of the GC cycle that picks it up.
+	work.pendingDeadlockDetection = true
+
+	// This read should be thread-safe for the same reason as the write above above.
+	// At most, we trigger the GC an additional time.
+	for work.pendingDeadlockDetection {
+		GC()
+	}
+}
+
 // gcWaitOnMark blocks until GC finishes the Nth mark phase. If GC has
 // already completed this mark phase, it returns immediately.
 func gcWaitOnMark(n uint32) {
@@ -695,6 +733,11 @@
 		mode = gcForceMode
 	} else if debug.gcstoptheworld == 2 {
 		mode = gcForceBlockMode
+	} else if goexperiment.DeadlockGC {
+		if work.pendingDeadlockDetection {
+			// Fully stop the world if running deadlock detection.
+			mode = gcForceBlockMode
+		}
 	}
 
 	// Ok, we're doing it! Stop everybody else
@@ -757,6 +800,7 @@
 	clearpools()
 
 	work.cycles.Add(1)
+	work.detectedDeadlocks = false
 
 	// Assists and workers can start the moment we start
 	// the world.
@@ -788,6 +832,14 @@
 	// possible.
 	setGCPhase(_GCmark)
 
+	if goexperiment.DeadlockGC {
+		if work.pendingDeadlockDetection {
+			// Write is thread-safe because the world is stopped
+			work.deadlockDetectionMode = true
+			work.pendingDeadlockDetection = false
+		}
+	}
+
 	gcBgMarkPrepare() // Must happen before assists are enabled.
 	gcPrepareMarkRoots()
 
@@ -888,6 +940,11 @@
 	// Ensure only one thread is running the ragged barrier at a
 	// time.
 	semacquire(&work.markDoneSema)
+	if goexperiment.DeadlockGC {
+		if work.deadlockDetectionMode {
+			gcDiscoverMoreStackRoots()
+		}
+	}
 
 top:
 	// Re-check transition condition under transition lock.
@@ -947,8 +1004,7 @@
 	// communicated work since we took markDoneSema. Therefore
 	// there are no grey objects and no more objects can be
 	// shaded. Transition to mark termination.
-	now := nanotime()
-	work.tMarkTerm = now
+	var now int64
 	getg().m.preemptoff = "gcing"
 	var stw worldStop
 	systemstack(func() {
@@ -994,6 +1050,54 @@
 		})
 		semrelease(&worldsema)
 		goto top
+	} else if goexperiment.DeadlockGC {
+		// Otherwise, do a deadlock detection round.
+		// Only do one deadlock detection round per GC cycle.
+		if work.deadlockDetectionMode && !work.detectedDeadlocks {
+			work.detectedDeadlocks = detectDeadlocks()
+
+			getg().m.preemptoff = ""
+			systemstack(func() {
+				// Accumulate the time we were stopped before we had to start again.
+				work.cpuStats.accumulateGCPauseTime(nanotime()-stw.finishedStopping, work.maxprocs)
+
+				now := startTheWorldWithSema(0, stw)
+				work.pauseNS += now - stw.startedStopping
+			})
+			semrelease(&worldsema)
+			goto top
+		}
+
+		now = nanotime()
+		work.tMarkTerm = now
+		// Check again whether any P needs to flush its write barrier
+		// to the GC work queue.
+		systemstack(func() {
+			for _, p := range allp {
+				wbBufFlush1(p)
+				if !p.gcw.empty() {
+					restart = true
+					break
+				}
+			}
+		})
+
+		// If that is the case, restart again. Once restarts are no longer needed,
+		// run this without deadlock detection.
+		if restart {
+			gcDebugMarkDone.restartedDueTo27993 = true
+
+			getg().m.preemptoff = ""
+			systemstack(func() {
+				// Accumulate the time we were stopped before we had to start again.
+				work.cpuStats.accumulateGCPauseTime(nanotime()-stw.finishedStopping, work.maxprocs)
+
+				now := startTheWorldWithSema(0, stw)
+				work.pauseNS += now - stw.startedStopping
+			})
+			semrelease(&worldsema)
+			goto top
+		}
 	}
 
 	gcComputeStartingStackSize()
@@ -1032,6 +1136,171 @@
 	gcMarkTermination(stw)
 }
 
+// Check if an object is marked in the heap.
+func checkIfMarked(p unsafe.Pointer) bool {
+	obj, span, objIndex := findObject(uintptr(p), 0, 0)
+	if obj != 0 {
+		mbits := span.markBitsForIndex(objIndex)
+		return mbits.isMarked()
+	}
+	// if we fall through to get here, we are within the stack ranges of reachable goroutines
+	return true
+}
+
+// maybeLive checks whether a goroutine may still be semantically runnable.
+// This returns true if the goroutine is waiting on at least one concurrency primitive
+// which is reachable in memory, i.e., has been by the GC.
+//
+// For goroutines which are semantically runnable, this will eventually return true
+// as the GC marking phase progresses.
+func (gp *g) maybeLive() bool {
+	// Unmask the goroutine address to ensure we are not
+	// dereferencing a masked address.
+	gp = gp.unmask()
+
+	switch gp.waitreason {
+	case waitReasonSelectNoCases,
+		waitReasonChanSendNilChan,
+		waitReasonChanReceiveNilChan:
+		// Select with no cases or communicating on nil channels
+		// make goroutines unrunnable by definition.
+		return false
+	case waitReasonChanReceive,
+		waitReasonSelect,
+		waitReasonChanSend:
+		// Cycle all through all *sudog to check whether
+		// the goroutine is waiting on a marked channel.
+		for sg := gp.waiting; sg != nil; sg = sg.waitlink {
+			if checkIfMarked(unsafe.Pointer(sg.c)) {
+				return true
+			}
+		}
+		return false
+	case waitReasonSyncCondWait,
+		waitReasonSyncWaitGroupWait,
+		waitReasonSyncMutexLock,
+		waitReasonSyncRWMutexLock,
+		waitReasonSyncRWMutexRLock:
+		// If waiting on mutexes, wait groups, or condition variables,
+		// check if the synchronization primitive attached to the sudog is marked.
+		if gp.waiting != nil {
+			// Unmask the sema address and check if it's marked.
+			return checkIfMarked(gcUnmask(gp.waiting.elem))
+		}
+	}
+	return true
+}
+
+// unmask returns a *g object with an unmasked address.
+//
+//go:nosplit
+func (gp *g) unmask() *g {
+	return (*g)(gcUnmask(unsafe.Pointer(gp)))
+}
+
+// mask returns a *g object with a masked address.
+//
+//go:nosplit
+func (gp *g) mask() *g {
+	return (*g)(gcMask(unsafe.Pointer(gp)))
+}
+
+// Check to see if more blocked but marked goroutines exist;
+// if so add them into root set and increment work.markrootJobs accordingly
+// return true if we need to run another phase of markroots; return false otherwise
+func gcDiscoverMoreStackRoots() {
+	// to begin with we have a set of unchecked stackRoots between
+	// vIndex and ivIndex.  During the loop, anything < vIndex should be
+	// valid stackRoots and anything >= ivIndex should be invalid stackRoots
+	// and the loop terminates when the two indices meet
+	var vIndex, ivIndex int = work.nLiveStackRoots, work.nStackRoots
+
+	// Reorder goroutine list
+	for vIndex < ivIndex {
+		gp := work.stackRoots[vIndex]
+		if gp.maybeLive() {
+			work.stackRoots[vIndex] = gp
+			vIndex = vIndex + 1
+			continue
+		}
+		for ivIndex = ivIndex - 1; ivIndex != vIndex; ivIndex = ivIndex - 1 {
+			if swapGp := work.stackRoots[ivIndex]; swapGp.maybeLive() {
+				work.stackRoots[ivIndex] = gp
+				work.stackRoots[vIndex] = swapGp.unmask()
+				vIndex = vIndex + 1
+				break
+			}
+		}
+	}
+
+	var oldRootJobs int32 = int32(atomic.Load(&work.markrootJobs))
+	var newRootJobs int32 = int32(work.baseStacks) + int32(vIndex)
+
+	if newRootJobs > oldRootJobs {
+		// reset markrootNext as it could have been incremented past markrootJobs
+		work.nLiveStackRoots = vIndex
+		atomic.Store(&work.markrootJobs, uint32(newRootJobs))
+	}
+}
+
+// detectDeadlocks scans the remaining stackRoots and marks any which are
+// blocked over exclusively unreachable concurrency primitives as leaked (deadlocked).
+// Returns true if goroutine leak was performed (or unnecessary).
+// Returns false if the GC cycle has not yet reached a fix point for reachable goroutines.
+func detectDeadlocks() bool {
+	// Report deadlocks and mark them unreachable, and resume marking
+	// we still need to mark these unreachable *g structs as they
+	// get reused, but their stack won't get scanned
+	if work.nLiveStackRoots == work.nStackRoots {
+		// nStackRoots == nLiveStackRoots means that all goroutines are marked.
+		return true
+	}
+
+	// Try to reach another fix point here. Keep scouting for runnable goroutines until
+	// none are left.
+	// Valid goroutines may be found after all GC work is drained.
+	// Make sure these are pushed to the runnable set and ready to be marked.
+	var foundMoreWork bool
+	for i := work.nLiveStackRoots; i < work.nStackRoots; i++ {
+		gp := work.stackRoots[i].unmask()
+		if readgstatus(gp) == _Gwaiting && !gp.maybeLive() {
+			// Blocking unrunnable goroutines will be skipped.
+			continue
+		}
+		work.stackRoots[i] = work.stackRoots[work.nLiveStackRoots]
+		work.stackRoots[work.nLiveStackRoots] = gp
+		work.nLiveStackRoots += 1
+		// We now have one more markroot job.
+		work.markrootJobs += 1
+		// We might still have some work to do.
+		// Make sure in the next iteration we will check re-check for new runnable goroutines.
+		foundMoreWork = true
+	}
+	if foundMoreWork {
+		// We found more work, so we need to resume the marking phase.
+		return false
+	}
+
+	// For the remaining goroutines, mark them as unreachable and deadlocking.
+	for i := work.nLiveStackRoots; i < work.nStackRoots; i++ {
+		gp := work.stackRoots[i].unmask()
+		casgstatus(gp, _Gwaiting, _Gdeadlocked)
+		fn := findfunc(gp.startpc)
+		if fn.valid() {
+			print("goroutine leak! goroutine ", gp.goid, ": ", funcname(fn), " Stack size: ", gp.stack.hi-gp.stack.lo, " bytes\n")
+		} else {
+			print("goroutine leak! goroutine ", gp.goid, ": !unnamed goroutine!", " Stack size: ", gp.stack.hi-gp.stack.lo, " bytes\n")
+		}
+		traceback(gp.sched.pc, gp.sched.sp, gp.sched.lr, gp)
+		println()
+		work.stackRoots[i] = gp
+	}
+	// Put the remaining roots as ready for marking and drain them.
+	work.markrootJobs += uint32(work.nStackRoots - work.nLiveStackRoots)
+	work.nLiveStackRoots = work.nStackRoots
+	return true
+}
+
 // World must be stopped and mark assists and background workers must be
 // disabled.
 func gcMarkTermination(stw worldStop) {
@@ -1185,6 +1454,13 @@
 	}
 
 	systemstack(func() {
+		if goexperiment.DeadlockGC {
+			// Pull the GC out of deadlock detection mode.
+			// Write is thread-safe because the world is stopped, and only one
+			// GC cycle can run at a time.
+			work.deadlockDetectionMode = false
+		}
+
 		// The memstats updated above must be updated with the world
 		// stopped to ensure consistency of some values, such as
 		// sched.idleTime and sched.totaltime. memstats also include
@@ -1612,10 +1888,12 @@
 	if !work.full.empty() || !work.spanq.empty() {
 		return true // global work available
 	}
-	if work.markrootNext < work.markrootJobs {
-		return true // root scan work available
+	if !work.deadlockDetectionMode {
+		return work.markrootNext < work.markrootJobs
 	}
-	return false
+	rootNext := atomic.Load(&work.markrootNext)
+	rootJobs := atomic.Load(&work.markrootJobs)
+	return rootNext < rootJobs
 }
 
 // gcMark runs the mark (or, for concurrent GC, mark termination)
@@ -1628,8 +1906,10 @@
 	work.tstart = startTime
 
 	// Check that there's no marking work remaining.
-	if work.full != 0 || work.markrootNext < work.markrootJobs || !work.spanq.empty() {
-		print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, " spanq.n=", work.spanq.size(), "\n")
+	rootNext := atomic.Load(&work.markrootNext)
+	rootJobs := atomic.Load(&work.markrootJobs)
+	if work.full != 0 || rootNext < rootJobs {
+		print("runtime: full=", hex(work.full), " next=", rootNext, " jobs=", rootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n")
 		panic("non-empty mark queue after concurrent mark")
 	}
 
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index a136c7a..1e13ecc 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -51,8 +51,86 @@
 	// Must be a multiple of the pageInUse bitmap element size and
 	// must also evenly divide pagesPerArena.
 	pagesPerSpanRoot = 512
+
+	gcUndoBitMask = uintptr(uintptrMask >> 2) // This constant reserves some bits of the address space for the GC to use in order to mask addresses.
+	gcBitMask     = ^gcUndoBitMask            // This flips every bit in gcUndoBitMask of uinptr width
 )
 
+// gcMask masks addresses that should not be automatically marked during the GC.
+//
+//go:nosplit
+func gcMask(p unsafe.Pointer) unsafe.Pointer {
+	if goexperiment.DeadlockGC {
+		return unsafe.Pointer(uintptr(p) | gcBitMask)
+	}
+	return p
+}
+
+// gcUnmask undoes the bit-mask applied to a pointer.
+//
+//go:nosplit
+func gcUnmask(p unsafe.Pointer) unsafe.Pointer {
+	if goexperiment.DeadlockGC {
+		return unsafe.Pointer(uintptr(p) & gcUndoBitMask)
+	}
+	return p
+}
+
+// internalBlocked returns true if the goroutine is blocked due to a
+// non-deadlocking waitReason, e.g. waiting for the netpoller or garbage collector.
+// Such goroutines should never be considered for deadlock detection.
+//
+//go:nosplit
+func (gp *g) internalBlocked() bool {
+	reason := gp.waitreason
+	return reason != waitReasonChanReceive &&
+		reason != waitReasonSyncWaitGroupWait &&
+		reason != waitReasonChanSend &&
+		reason != waitReasonChanReceiveNilChan &&
+		reason != waitReasonChanSendNilChan &&
+		reason != waitReasonSelect &&
+		reason != waitReasonSelectNoCases &&
+		reason != waitReasonSyncMutexLock &&
+		reason != waitReasonSyncRWMutexRLock &&
+		reason != waitReasonSyncRWMutexLock &&
+		reason != waitReasonSyncCondWait
+}
+
+// The world must be stopped or allglock must be held.
+// go through the snapshot of allgs, putting them into an arrays,
+// separated by index, where [0:blockedIndex] contains only running Gs
+// allGs[blockedIndex:] contain only blocking Gs
+// To avoid GC from marking and scanning the blocked Gs by scanning
+// the returned array (which is heap allocated), we mask the highest
+// bit of the pointers to Gs with gcBitMask.
+func allGsSnapshotSortedForGC() ([]*g, int) {
+	assertWorldStoppedOrLockHeld(&allglock)
+
+	allgsSorted := make([]*g, len(allgs))
+
+	// Indices cutting off runnable and blocked Gs.
+	var currIndex, blockedIndex = 0, len(allgsSorted) - 1
+	for _, gp := range allgs {
+		gp = gp.unmask()
+		// not sure if we need atomic load because we are stopping the world,
+		// but do it just to be safe for now
+		if status := readgstatus(gp); status != _Gwaiting || gp.internalBlocked() {
+			allgsSorted[currIndex] = gp
+			currIndex++
+		} else {
+			allgsSorted[blockedIndex] = gp.mask()
+			blockedIndex--
+		}
+	}
+
+	// Because the world is stopped or allglock is held, allgadd
+	// cannot happen concurrently with this. allgs grows
+	// monotonically and existing entries never change, so we can
+	// simply return a copy of the slice header. For added safety,
+	// we trim everything past len because that can still change.
+	return allgsSorted, blockedIndex + 1
+}
+
 // gcPrepareMarkRoots queues root scanning jobs (stacks, globals, and
 // some miscellany) and initializes scanning-related state.
 //
@@ -102,11 +180,23 @@
 	// ignore them because they begin life without any roots, so
 	// there's nothing to scan, and any roots they create during
 	// the concurrent phase will be caught by the write barrier.
-	work.stackRoots = allGsSnapshot()
+	if goexperiment.DeadlockGC {
+		if work.deadlockDetectionMode {
+			work.stackRoots, work.nLiveStackRoots = allGsSnapshotSortedForGC()
+		} else {
+			// regular GC --- scan every go routine
+			work.stackRoots = allGsSnapshot()
+			work.nLiveStackRoots = len(work.stackRoots)
+		}
+	} else {
+		// regular GC --- scan every go routine
+		work.stackRoots = allGsSnapshot()
+		work.nLiveStackRoots = len(work.stackRoots)
+	}
 	work.nStackRoots = len(work.stackRoots)
 
 	work.markrootNext = 0
-	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
+	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nLiveStackRoots)
 
 	// Calculate base indexes of each root type
 	work.baseData = uint32(fixedRootCount)
@@ -119,8 +209,10 @@
 // gcMarkRootCheck checks that all roots have been scanned. It is
 // purely for debugging.
 func gcMarkRootCheck() {
-	if work.markrootNext < work.markrootJobs {
-		print(work.markrootNext, " of ", work.markrootJobs, " markroot jobs done\n")
+	rootNext := atomic.Load(&work.markrootNext)
+	rootJobs := atomic.Load(&work.markrootJobs)
+	if rootNext < rootJobs {
+		print(rootNext, " of ", rootJobs, " markroot jobs done\n")
 		throw("left over markroot jobs")
 	}
 
@@ -868,7 +960,7 @@
 	case _Grunning:
 		print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
 		throw("scanstack: goroutine not stopped")
-	case _Grunnable, _Gsyscall, _Gwaiting:
+	case _Grunnable, _Gsyscall, _Gwaiting, _Gdeadlocked:
 		// ok
 	}
 
@@ -1136,6 +1228,32 @@
 	gcDrain(gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
 }
 
+func gcUpdateMarkrootNext() (uint32, bool) {
+	var success bool
+	var next uint32 = atomic.Load(&work.markrootNext)
+	var jobs uint32 = atomic.Load(&work.markrootJobs)
+
+	if next < jobs {
+		// still work available at the moment
+		for !success {
+			success = atomic.Cas(&work.markrootNext, next, next+1)
+			// We manage to snatch a root job. Return the root index.
+			if success {
+				return next, true
+			}
+
+			// Get the latest value of markrootNext.
+			next = atomic.Load(&work.markrootNext)
+			jobs := atomic.Load(&work.markrootJobs)
+			// We are out of markroot jobs.
+			if next >= jobs {
+				break
+			}
+		}
+	}
+	return 0, false
+}
+
 // gcDrain scans roots and objects in work buffers, blackening grey
 // objects until it is unable to get more work. It may return before
 // GC is done; it's the caller's responsibility to balance work from
@@ -1194,13 +1312,16 @@
 		}
 	}
 
-	// Drain root marking jobs.
-	if work.markrootNext < work.markrootJobs {
+	rootNext := atomic.Load(&work.markrootNext)
+	rootJobs := atomic.Load(&work.markrootJobs)
+	if rootNext < rootJobs {
 		// Stop if we're preemptible, if someone wants to STW, or if
 		// someone is calling forEachP.
+		//
+		// Continue unconditionally if we're draining partial deadlocks.
 		for !(gp.preempt && (preemptible || sched.gcwaiting.Load() || pp.runSafePointFn != 0)) {
-			job := atomic.Xadd(&work.markrootNext, +1) - 1
-			if job >= work.markrootJobs {
+			job, success := gcUpdateMarkrootNext()
+			if !success {
 				break
 			}
 			markroot(gcw, job, flushBgCredit)
@@ -1346,9 +1467,9 @@
 					wbBufFlush()
 					if b = gcw.tryGetObj(); b == 0 {
 						// Try to do a root job.
-						if work.markrootNext < work.markrootJobs {
-							job := atomic.Xadd(&work.markrootNext, +1) - 1
-							if job < work.markrootJobs {
+						if atomic.Load(&work.markrootNext) < atomic.Load(&work.markrootJobs) {
+							job, success := gcUpdateMarkrootNext()
+							if success {
 								workFlushed += markroot(gcw, job, false)
 								continue
 							}
@@ -1513,6 +1634,14 @@
 		// At this point we have extracted the next potential pointer.
 		// Quickly filter out nil and pointers back to the current object.
 		if obj != 0 && obj-b >= n {
+			if goexperiment.DeadlockGC {
+				// The GC will skip masked addresses if DeadlockGC is enabled.
+				if (uintptr(unsafe.Pointer(obj)) & gcBitMask) == gcBitMask {
+					// Skip masked pointers.
+					continue
+				}
+			}
+
 			// Test if obj points into the Go heap and, if so,
 			// mark the object.
 			//
diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go
index c41c355..f2e9889 100644
--- a/src/runtime/preempt.go
+++ b/src/runtime/preempt.go
@@ -160,7 +160,7 @@
 			s = _Gwaiting
 			fallthrough
 
-		case _Grunnable, _Gsyscall, _Gwaiting:
+		case _Grunnable, _Gsyscall, _Gwaiting, _Gdeadlocked:
 			// Claim goroutine by setting scan bit.
 			// This may race with execution or readying of gp.
 			// The scan bit keeps it from transition state.
@@ -269,6 +269,7 @@
 
 	case _Grunnable | _Gscan,
 		_Gwaiting | _Gscan,
+		_Gdeadlocked | _Gscan,
 		_Gsyscall | _Gscan:
 		casfrom_Gscanstatus(gp, s, s&^_Gscan)
 	}
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index b41bbe9..08de431 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -8,6 +8,7 @@
 	"internal/abi"
 	"internal/cpu"
 	"internal/goarch"
+	"internal/goexperiment"
 	"internal/goos"
 	"internal/runtime/atomic"
 	"internal/runtime/exithook"
@@ -689,7 +690,7 @@
 	}
 
 	lock(&allglock)
-	allgs = append(allgs, gp)
+	allgs = append(allgs, gp.mask())
 	if &allgs[0] != allgptr {
 		atomicstorep(unsafe.Pointer(&allgptr), unsafe.Pointer(&allgs[0]))
 	}
@@ -708,6 +709,11 @@
 	// monotonically and existing entries never change, so we can
 	// simply return a copy of the slice header. For added safety,
 	// we trim everything past len because that can still change.
+	if goexperiment.DeadlockGC {
+		for i, gp := range allgs {
+			allgs[i] = gp.unmask()
+		}
+	}
 	return allgs[:len(allgs):len(allgs)]
 }
 
@@ -729,7 +735,7 @@
 func forEachG(fn func(gp *g)) {
 	lock(&allglock)
 	for _, gp := range allgs {
-		fn(gp)
+		fn(gp.unmask())
 	}
 	unlock(&allglock)
 }
@@ -742,7 +748,7 @@
 	ptr, length := atomicAllG()
 	for i := uintptr(0); i < length; i++ {
 		gp := atomicAllGIndex(ptr, i)
-		fn(gp)
+		fn(gp.unmask())
 	}
 	return
 }
@@ -1208,6 +1214,7 @@
 		_Gscanwaiting,
 		_Gscanrunning,
 		_Gscansyscall,
+		_Gscandeadlocked,
 		_Gscanpreempted:
 		if newval == oldval&^_Gscan {
 			success = gp.atomicstatus.CompareAndSwap(oldval, newval)
@@ -1228,6 +1235,7 @@
 	case _Grunnable,
 		_Grunning,
 		_Gwaiting,
+		_Gdeadlocked,
 		_Gsyscall:
 		if newval == oldval|_Gscan {
 			r := gp.atomicstatus.CompareAndSwap(oldval, newval)
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 527611f..7e5d276 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -87,6 +87,9 @@
 	// ready()ing this G.
 	_Gpreempted // 9
 
+	// _Gdeadlocked represents a deadlocked goroutine caught by the GC.
+	_Gdeadlocked // 10
+
 	// _Gscan combined with one of the above states other than
 	// _Grunning indicates that GC is scanning the stack. The
 	// goroutine is not executing user code and the stack is owned
@@ -104,6 +107,8 @@
 	_Gscansyscall   = _Gscan + _Gsyscall   // 0x1003
 	_Gscanwaiting   = _Gscan + _Gwaiting   // 0x1004
 	_Gscanpreempted = _Gscan + _Gpreempted // 0x1009
+
+	_Gscandeadlocked = _Gscan + _Gdeadlocked // 0x100a
 )
 
 const (
@@ -1159,12 +1164,26 @@
 	return waitReasonStrings[w]
 }
 
+// isMutexWait returns true if the goroutine is blocked because of
+// sync.Mutex.Lock or sync.RWMutex.[R]Lock.
+//
+//go:nosplit
 func (w waitReason) isMutexWait() bool {
 	return w == waitReasonSyncMutexLock ||
 		w == waitReasonSyncRWMutexRLock ||
 		w == waitReasonSyncRWMutexLock
 }
 
+// isSyncWait returns true if the goroutine is blocked because of
+// sync library primitive operations.
+//
+//go:nosplit
+func (w waitReason) isSyncWait() bool {
+	return w == waitReasonSyncWaitGroupWait ||
+		w == waitReasonSyncCondWait ||
+		w.isMutexWait()
+}
+
 func (w waitReason) isWaitingForSuspendG() bool {
 	return isWaitingForSuspendG[w]
 }
diff --git a/src/runtime/sema.go b/src/runtime/sema.go
index 6af49b1..d728858 100644
--- a/src/runtime/sema.go
+++ b/src/runtime/sema.go
@@ -21,6 +21,7 @@
 
 import (
 	"internal/cpu"
+	"internal/goexperiment"
 	"internal/runtime/atomic"
 	"unsafe"
 )
@@ -188,7 +189,7 @@
 		}
 		// Any semrelease after the cansemacquire knows we're waiting
 		// (we set nwait above), so go to sleep.
-		root.queue(addr, s, lifo)
+		root.queue(addr, s, lifo, reason.isSyncWait())
 		goparkunlock(&root.lock, reason, traceBlockSync, 4+skipframes)
 		if s.ticket != 0 || cansemacquire(addr) {
 			break
@@ -301,9 +302,18 @@
 }
 
 // queue adds s to the blocked goroutines in semaRoot.
-func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) {
+func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool, syncSema bool) {
 	s.g = getg()
-	s.elem = unsafe.Pointer(addr)
+	pAddr := unsafe.Pointer(addr)
+	if goexperiment.DeadlockGC {
+		if syncSema {
+			// Mask the addr so it doesn't get marked during GC
+			// through marking of the treap or marking of the blocked goroutine
+			pAddr = gcMask(unsafe.Pointer(addr))
+			s.g.waiting = s
+		}
+	}
+	s.elem = pAddr
 	s.next = nil
 	s.prev = nil
 	s.waiters = 0
@@ -311,7 +321,13 @@
 	var last *sudog
 	pt := &root.treap
 	for t := *pt; t != nil; t = *pt {
-		if t.elem == unsafe.Pointer(addr) {
+		var cmp bool
+		if goexperiment.DeadlockGC {
+			cmp = uintptr(gcUnmask(pAddr)) == uintptr(gcUnmask(t.elem))
+		} else {
+			cmp = uintptr(pAddr) == uintptr(t.elem)
+		}
+		if cmp {
 			// Already have addr in list.
 			if lifo {
 				// Substitute s in t's place in treap.
@@ -357,7 +373,12 @@
 			return
 		}
 		last = t
-		if uintptr(unsafe.Pointer(addr)) < uintptr(t.elem) {
+		if goexperiment.DeadlockGC {
+			cmp = uintptr(gcUnmask(pAddr)) < uintptr(gcUnmask(t.elem))
+		} else {
+			cmp = uintptr(pAddr) < uintptr(t.elem)
+		}
+		if cmp {
 			pt = &t.prev
 		} else {
 			pt = &t.next
@@ -402,11 +423,24 @@
 func (root *semaRoot) dequeue(addr *uint32) (found *sudog, now, tailtime int64) {
 	ps := &root.treap
 	s := *ps
+
 	for ; s != nil; s = *ps {
-		if s.elem == unsafe.Pointer(addr) {
+		var cmp bool
+		if goexperiment.DeadlockGC {
+			cmp = gcUnmask(unsafe.Pointer(addr)) == gcUnmask(s.elem)
+		} else {
+			cmp = unsafe.Pointer(addr) == s.elem
+		}
+		if cmp {
 			goto Found
 		}
-		if uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) {
+
+		if goexperiment.DeadlockGC {
+			cmp = uintptr(gcUnmask(unsafe.Pointer(addr))) < uintptr(gcUnmask(s.elem))
+		} else {
+			cmp = uintptr(unsafe.Pointer(addr)) < uintptr(s.elem)
+		}
+		if cmp {
 			ps = &s.prev
 		} else {
 			ps = &s.next
@@ -470,6 +504,9 @@
 		}
 		tailtime = s.acquiretime
 	}
+	if goexperiment.DeadlockGC {
+		s.g.waiting = nil
+	}
 	s.parent = nil
 	s.elem = nil
 	s.next = nil
@@ -590,6 +627,10 @@
 	// Enqueue itself.
 	s := acquireSudog()
 	s.g = getg()
+	if goexperiment.DeadlockGC {
+		s.elem = gcMask(unsafe.Pointer(l))
+		s.g.waiting = s
+	}
 	s.ticket = t
 	s.releasetime = 0
 	t0 := int64(0)
@@ -607,6 +648,10 @@
 	if t0 != 0 {
 		blockevent(s.releasetime-t0, 2)
 	}
+	if goexperiment.DeadlockGC {
+		s.g.waiting = nil
+		s.elem = nil
+	}
 	releaseSudog(s)
 }
 
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 00c0f08..44f2897 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -1199,14 +1199,15 @@
 }
 
 var gStatusStrings = [...]string{
-	_Gidle:      "idle",
-	_Grunnable:  "runnable",
-	_Grunning:   "running",
-	_Gsyscall:   "syscall",
-	_Gwaiting:   "waiting",
-	_Gdead:      "dead",
-	_Gcopystack: "copystack",
-	_Gpreempted: "preempted",
+	_Gidle:       "idle",
+	_Grunnable:   "runnable",
+	_Grunning:    "running",
+	_Gsyscall:    "syscall",
+	_Gwaiting:    "waiting",
+	_Gdead:       "dead",
+	_Gcopystack:  "copystack",
+	_Gdeadlocked: "deadlocked",
+	_Gpreempted:  "preempted",
 }
 
 func goroutineheader(gp *g) {
diff --git a/src/runtime/tracestatus.go b/src/runtime/tracestatus.go
index 03ec81f..e04f012 100644
--- a/src/runtime/tracestatus.go
+++ b/src/runtime/tracestatus.go
@@ -122,7 +122,7 @@
 		tgs = tracev2.GoRunning
 	case _Gsyscall:
 		tgs = tracev2.GoSyscall
-	case _Gwaiting, _Gpreempted:
+	case _Gwaiting, _Gpreempted, _Gdeadlocked:
 		// There are a number of cases where a G might end up in
 		// _Gwaiting but it's actually running in a non-preemptive
 		// state but needs to present itself as preempted to the

Change information

Files:

A src/internal/goexperiment/exp_deadlockgc_off.go
A src/internal/goexperiment/exp_deadlockgc_on.go
M src/internal/goexperiment/flags.go
M src/runtime/export_test.go
M src/runtime/mgc.go
M src/runtime/mgcmark.go
M src/runtime/preempt.go
M src/runtime/proc.go
M src/runtime/runtime2.go
M src/runtime/sema.go
M src/runtime/traceback.go
M src/runtime/tracestatus.go

Change size: L

Delta: 12 files changed, 549 insertions(+), 47 deletions(-)

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Gopher Robot (Gerrit)

unread,

Jul 16, 2025, 8:18:27 AMJul 16

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Gopher Robot added 1 comment

Patchset-level comments

File-level comment, Patchset 1 (Latest):

Gopher Robot . unresolved

I spotted some possible problems.

These findings are based on simple heuristics. If a finding appears wrong, briefly reply here saying so. Otherwise, please address any problems and update the GitHub PR. When complete, mark this comment as 'Done' and click the [blue 'Reply' button](https://go.dev/wiki/GerritBot#i-left-a-reply-to-a-comment-in-gerrit-but-no-one-but-me-can-see-it) above.

Possible problems detected:
  1. The commit title should start with the primary affected package name followed by a colon, like "net/http: improve [...]".
  2. Are you using markdown? Markdown should not be used to augment text in the commit message.
  3. Do you have the right bug reference format? For this repo, the format is usually 'Fixes #12345' or 'Updates #12345' at the end of the commit message.

The commit title and commit message body come from the GitHub PR title and description, and must be edited in the GitHub web interface (not via git). For instructions, see [here](https://go.dev/wiki/GerritBot/#how-does-gerritbot-determine-the-final-commit-message). For guidelines on commit messages for the Go project, see [here](https://go.dev/doc/contribute#commit_messages).

(In general for Gerrit code reviews, the change author is expected to [log in to Gerrit](https://go-review.googlesource.com/login/) with a Gmail or other Google account and then close out each piece of feedback by marking it as 'Done' if implemented as suggested or otherwise reply to each review comment. See the [Review](https://go.dev/doc/contribute#review) section of the Contributing Guide for details.)

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review

No-Unresolved-Comments

Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gopher Robot (Gerrit)

unread,

Jul 16, 2025, 8:22:16 AMJul 16

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Message from Gopher Robot

Congratulations on opening your first change. Thank you for your contribution!

Next steps:
A maintainer will review your change and provide feedback. See
https://go.dev/doc/contribute#review for more info and tips to get your
patch through code review.

Most changes in the Go project go through a few rounds of revision. This can be
surprising to people new to the project. The careful, iterative review process
is our way of helping mentor contributors and ensuring that their contributions
have a lasting impact.

During May-July and Nov-Jan the Go project is in a code freeze, during which
little code gets reviewed or merged. If a reviewer responds with a comment like
R=go1.11 or adds a tag like "wait-release", it means that this CL will be
reviewed as part of the next development cycle. See https://go.dev/s/release
for more details.

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 16, 2025, 8:51:51 AMJul 16

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall and Michael Knyszek

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #2 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Jul 16, 2025, 8:59:24 AMJul 16

to Gerrit Bot, goph...@pubsubhelper.golang.org, Milind Chabbi, Austin Clements, Keith Randall, Michael Knyszek, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall and Michael Knyszek

Georgian-Vlad Saioc added 2 comments

Patchset-level comments

File-level comment, Patchset 1:

Gopher Robot . resolved

I spotted some possible problems.
These findings are based on simple heuristics. If a finding appears wrong, briefly reply here saying so. Otherwise, please address any problems and update the GitHub PR. When complete, mark this comment as 'Done' and click the [blue 'Reply' button](https://go.dev/wiki/GerritBot#i-left-a-reply-to-a-comment-in-gerrit-but-no-one-but-me-can-see-it) above.
Possible problems detected:
  1. The commit title should start with the primary affected package name followed by a colon, like "net/http: improve [...]".
  2. Are you using markdown? Markdown should not be used to augment text in the commit message.
  3. Do you have the right bug reference format? For this repo, the format is usually 'Fixes #12345' or 'Updates #12345' at the end of the commit message.
The commit title and commit message body come from the GitHub PR title and description, and must be edited in the GitHub web interface (not via git). For instructions, see [here](https://go.dev/wiki/GerritBot/#how-does-gerritbot-determine-the-final-commit-message). For guidelines on commit messages for the Go project, see [here](https://go.dev/doc/contribute#commit_messages).

(In general for Gerrit code reviews, the change author is expected to [log in to Gerrit](https://go-review.googlesource.com/login/) with a Gmail or other Google account and then close out each piece of feedback by marking it as 'Done' if implemented as suggested or otherwise reply to each review comment. See the [Review](https://go.dev/doc/contribute#review) section of the Contributing Guide for details.)

Georgian-Vlad Saioc

Done

File-level comment, Patchset 2 (Latest):

Georgian-Vlad Saioc . resolved

Marked as unresolved.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek

Submit Requirements:

Code-Review

No-Unresolved-Comments

Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 16, 2025, 8:59:42 AMJul 16

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall and Michael Knyszek

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #3 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 17, 2025, 7:21:36 AMJul 17

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #4 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek

Michael Pratt

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Keith Randall (Gerrit)

unread,

Jul 17, 2025, 8:13:39 PMJul 17

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Michael Knyszek, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Michael Knyszek and Michael Pratt

Keith Randall voted and added 1 comment

Votes added by Keith Randall

Hold

+1

1 comment

Patchset-level comments

File-level comment, Patchset 4 (Latest):

Keith Randall . resolved

On hold pending the proposal.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review

No-Holds

No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 18, 2025, 7:30:45 AMJul 18

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #5 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Jul 18, 2025, 10:57:47 AMJul 18

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Michael Knyszek, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds added 2 comments

Patchset-level comments

File-level comment, Patchset 5 (Latest):

t hepudds . unresolved

Hi Vlad, very exciting to see! Are you planning on linking to https://github.com/VladSaiocUber/go-gc-proposal either here in the CL description and/or on #74609? That link seems to provide a more concrete overview than the current summary at #74609 (though maybe that link is out of sync or something, or maybe there is some other reason you haven't linked it yet... or maybe I just missed it).

File src/internal/goexperiment/flags.go

Line 134, Patchset 5 (Latest): GolfGC bool

t hepudds . unresolved

Is the "F" in "GOLF" accurate for this CL and proposal? My initial impression was the "Fixer" part of the GOLF paper is effectively not part of this proposal, but maybe I misunderstood.

Rather than using `GolfGC` as the GOEXPERIMENT name, one idea would be to instead just spell it out, such as maybe `GoroutineLeak` and hence used as `GOEXPERIMENT=goroutineleak`?

That might be clearer for someone who doesn't recognize GOLF. It's a little long, but it wouldn't be the longest GOEXPERIMENT. (For example, current GOEXPERIMENTs like `PreemptibleLoops`, `SyncHashTrieMap`, and `AliasTypeParams` are all more characters than `GoroutineLeak`).

Also, more often than not, I think GOEXPERIMENTs tend to be full words, though I think you have flexibility on that. (I didn't count, but skimming the current list, it's maybe full words ~2/3 of time, though it depends on how you count words like `func` and `sync`).

Or if you don't like `GoroutineLeak`, maybe you have a better idea for how to avoid the GOLF acronym here? Or maybe others will have opinions. (Or maybe this comment is just off base 😅).

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds

No-Unresolved-Comments

Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Jul 18, 2025, 7:18:28 PMJul 18

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall and Michael Pratt

Michael Knyszek added 20 comments

Patchset-level comments

File-level comment, Patchset 5 (Latest):

Michael Knyszek . resolved

I left quite a few comments but most of them are shallow. overall, it looks great and thank you for taking the time to upstream this! I like the overall approach a lot.

the only big comment I left was about the gcMask mechanism. I left what I believe is a detailed explanation of a possible alternative approach. please feel free to reach out to me if you have any questions and we can discuss it more. I am also open to prototyping the alternative I suggested if you don't have the time.

File src/runtime/mgc.go

Line 367, Patchset 5 (Latest):

	markrootNext uint32 // next markroot job
	markrootJobs uint32 // number of markroot jobs

Michael Knyszek . unresolved

I mention this elsewhere, but I think we could just make these atomic.Uint32. I don't think they're going to be very heavily contended.

Line 386, Patchset 5 (Latest): // detection during the next GC cycle; it is set by DetectGoroutineLeaks()

Michael Knyszek . unresolved

nit: replace with spaces

Line 393, Patchset 5 (Latest): pendingGoleakDetection, detectingGoleaks, detectedGoleaks bool

Michael Knyszek . unresolved

a few nits on the representation and naming.

for one, I think this would be cleaner as a nested anonymous struct. I also think the names could be a little clearer ("detected" seems to imply that we successfully detected a leak, where really it just means "finished"). maybe something like:
```
goLeakCheck struct {
    pending bool
    enabled bool
    done    bool
}
```

"enabled" and "done" are also more consistent with naming elsewhere in the runtime.

Line 576, Patchset 5 (Latest):func FindGoLeaks() {

Michael Knyszek . unresolved

you know this but just leaving a note that this is a new API, so this we definitely can't add. this should be an unexported function that is made available to runtime/pprof (or whatever we plan to do) via linkname. (and we'll explicitly blocklist the linkname in the compiler for use outside of runtime/pprof.)

Line 582, Patchset 5 (Latest):

	// This write should be thread-safe, as the overwritten value is true.
	// pendingGoleakDetection is only set to false under STW at the start


	// of the GC cycle that picks it up.

Michael Knyszek . unresolved

I see no reason why this couldn't just be an atomic.Bool. I don't think it makes much of a difference performance-wise and it's just a little easier to reason about (removes any doubt about its synchronization).

Line 840, Patchset 5 (Latest):

			// Write is thread-safe because the world is stopped
			work.detectingGoleaks = true

Michael Knyszek . unresolved

I think you should just document detectingGoleaks to be protected by STW and remove this comment. that's more consistent with how we document other variables.

Line 946, Patchset 5 (Latest): if goexperiment.GolfGC {

Michael Knyszek . unresolved

the extra goexperiment check is unnecessary. IMO this is clearly a very cheap and non-hot-path check, so it doesn't need the compile-time branch.

Line 951, Patchset 5 (Parent): work.tMarkTerm = now

Michael Knyszek . unresolved

I think there's a bug here in that tMarkTerm might no longer be set correctly if the goexperiment is disabled. why not just leave this line and update it below?

Line 1076, Patchset 5 (Latest):

		now = nanotime()
		work.tMarkTerm = now


		// Check again whether any P needs to flush its write barrier

		// to the GC work queue.

systemstack(func() {


			for _, p := range allp {

				wbBufFlush1(p)
				if !p.gcw.empty() {
					restart = true
					break
				}
			}
		})



		// If that is the case, restart again. Once restarts are no longer needed,

		// run this without deadlock detection.

		if restart {
			gcDebugMarkDone.restartedDueTo27993 = true

			getg().m.preemptoff = ""
			systemstack(func() {


				// Accumulate the time we were stopped before we had to start again.

				work.cpuStats.accumulateGCPauseTime(nanotime()-stw.finishedStopping, work.maxprocs)

				now := startTheWorldWithSema(0, stw)


				work.pauseNS += now - stw.startedStopping

			})
			semrelease(&worldsema)
			goto top
		}

Michael Knyszek . unresolved

is this necessary? intuitively I would think that it isn't. the leak detector will already restart the GC when it needs to (goto top), and this restart (anything due to issue #27993) will already have happened above if it was necessary. does something break if you remove this? if so, I suspect something else may be wrong.

Line 1144, Patchset 5 (Latest):// Check if an object is marked in the heap.

Michael Knyszek . unresolved

minor nit: for consistency, documentation should start with the name of the function.

Line 1145, Patchset 5 (Latest):func checkIfMarked(p unsafe.Pointer) bool {

Michael Knyszek . unresolved

I think this should go next to its callees in mbitmap.go (so the details all live together) and for consistency with other functions in the runtime, let's maybe call this `isMarkedOrNotInHeap`? I think that's the most clear name of what this function is checking, even though it is a mouthful.

(the details here are kind of subtle actually. just "IfMarked" doesn't say much about how it treats non-heap memory, but the caller of this function wants to treat all non-heap memory as essentially reachable.)

Line 1161, Patchset 5 (Latest):func (gp *g) maybeLive() bool {

Michael Knyszek . unresolved

I think this name is a little unclear ("live" is *very* overloaded in the runtime, as you can imagine). WDYT about inverting the condition it returns and calling it something like `isPermanentlyBlocked`?

Line 1896, Patchset 5 (Latest):

	if !work.detectingGoleaks {
		return work.markrootNext < work.markrootJobs
	}
	rootNext := atomic.Load(&work.markrootNext)
	rootJobs := atomic.Load(&work.markrootJobs)
	return rootNext < rootJobs

Michael Knyszek . unresolved

I would prefer to keep the previous structure which is more explicit, but I think we can do that (and avoid checking detectingGoleaks) if we just make work.markrootNext and work.markrootJobs atomic variables. I think it's fine. loads are cheap on most architectures and these variables are not hot.

File src/runtime/mgcmark.go

Line 55, Patchset 5 (Latest):

	gcUndoBitMask = uintptr(uintptrMask >> 2) // This constant reserves some bits of the address space for the GC to use in order to mask addresses.


	gcBitMask     = ^gcUndoBitMask            // This flips every bit in gcUndoBitMask of uinptr width

Michael Knyszek . unresolved

it occurs to me that the masking approach means that this implementation might struggle on 32-bit platforms. I mentioned this long ago, but I would still like to consider hiding the few pointers we need to from the GC some other way.

Line 59, Patchset 5 (Latest):// gcMask masks addresses that should not be automatically marked during the GC.

Michael Knyszek . unresolved

there are three problems with the gcMask mechanism that concern me.

1. it's invasive. I understand that you've ran some benchmarks and don't see any difference. that's good. if it was absolutely necessary I would be open to the idea, but it's invasive enough that it needs to be understood by every place that scans objects. this is just scanobject for now, but in Green Tea it will be more. not a huge deal, but it would be simpler to reason about these deep and hot paths in the runtime if we didn't have to think about masked pointers.

2. reserving the top 2 bits for masking is probably a no-go on 32-bit platforms. while usage on 32-bit platforms is low, it's not nothing. I think it's important that this functionality works everywhere. (again, if this was absolutely necessary, then fine, we can say only 64-bit. but I don't think that it is. see below.)

3. it's harder to reason about what the GC will and will not observe statically. right now when we see a uintptr that has a pointer stored into it, it raises alarm bells that some field may contain a pointer that the GC doesn't know about. these are typically _very_ carefully managed, and signals to the maintainers that we need to look at a variable's usage carefully. gcMask is unfortunately not a static property of a variable but a dynamic property. this means (1) we're adding a new kind of pointer-hiding we don't already do, and so aren't familiar with, and (2) variables can be sometimes masked sometimes not-masked, which is just more complicated to reason about at a baseline.

so like I said, I think we can avoid pointer masking. here's how.

if I'm understanding correctly, there are exactly 2 places where masked GC pointers are necessary. I think we can hide both of them as necessary without adding the masking mechanism. I believe this will be simpler, since one problem with the masking, besides being somewhat invasive,

the first place is allg. I think we can fill allg with guintptr instead of `*g`. if we're not detecting deadlock goroutines, I think we should simply iterate over the `allg` and manually enqueue them with the GC. when we are detecting deadlocked goroutines, we can do what your code currently does, which is create a new `[]*g` for whatever set of `g`s we want to expose to the GC. I think this is sufficient to hide the goroutines in `allg` that you need to hide.

the second place is sudog.elem, only in the syncSema case. the first question I have is why it even *is* okay to mask the elem field at all. I think the code, as written (and I left a comment to this effect) presumes that the masked sudog.elem will be picked by from somewhere else (like the goroutine's stack) but I don't think this is a guarantee. I think to avoid a future footgun, we really want to be able to presume that sudog.elem could be the last pointer to the object its pointing to.

here's an apporach that I think would work. first, I think we can change elem to simply be a uintptr. then, we can add a second field to the sudog called elemLive which is an unsafe.Pointer. in the usual codepaths, we manage both elem and elemLive together, and keep them in sync. (we can even add a new special type to wrap the two (struct{ p uintptr; live unsafe.Pointer }) to make it abundantly clear that they're always operated on together.) when it comes time to perform the goroutine leak analysis in the next GC cycle, we walk all the blocked goroutines (AFAICT we do this anyway) and clear g.waiting.elemLive (before enabling the write barrier), based on the G's waitreason. later, we perform the goroutine leak detection, and call checkIfMarked. after we have an answer, we immediately queue s.elem explicitly with the GC if it's not marked, and restore s.elemLive.

I understand this is a lot to consider, but I think it works out better, provided it works. maybe there's something I'm missing that means this doesn't work, and the full dynamism of the gcMask approach is necessary, but at the moment I don't see why it would be. I also think this approach pushes more of the complexity into the leak detector and out of the regular GC paths.

let me know what you think.

Line 173, Patchset 5 (Latest): if goexperiment.GolfGC {

Michael Knyszek . unresolved

same as elsewhere, I think the extra branch here is hurting more than it's helping in terms of readability. I would instead put `// For goexperiment.GolfGC.` next to all branches depending on `work.detectingGoleaks`

File src/runtime/runtime2.go

Line 110, Patchset 5 (Latest):

Michael Knyszek . unresolved

nit: remove newline to stay consistent with the above

File src/runtime/sema.go

Line 508, Patchset 5 (Latest): s.g.waiting = nil

Michael Knyszek . unresolved

this could use a comment

Line 631, Patchset 5 (Latest): // Storing this pointer is

Michael Knyszek . unresolved

incomplete comment

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Pratt

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Jul 18, 2025, 8:58:17 PMJul 18

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall and Michael Pratt

Michael Knyszek added 1 comment

Patchset-level comments

File-level comment, Patchset 5 (Latest):

Michael Knyszek . unresolved

also, this change needs tests. I think the best thing to do would be to use src/runtime/testdata/testprog to create leaked goroutines in subprocesses and then check the output from the runtime tests. (look for tests that call `runTestProg`. they invoke a particular test in the test program by passing a particular command line parameter, which usually maps to the name of the function that is the entry point in the testprog binary.)

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 21, 2025, 10:13:00 AMJul 21

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #6 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Jul 21, 2025, 10:14:33 AMJul 21

to Gerrit Bot, goph...@pubsubhelper.golang.org, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Michael Knyszek, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 18 comments

Patchset-level comments

File-level comment, Patchset 5:

t hepudds . resolved

Hi Vlad, very exciting to see! Are you planning on linking to https://github.com/VladSaiocUber/go-gc-proposal either here in the CL description and/or on #74609? That link seems to provide a more concrete overview than the current summary at #74609 (though maybe that link is out of sync or something, or maybe there is some other reason you haven't linked it yet... or maybe I just missed it).

Georgian-Vlad Saioc

The proposal is still under construction, but I will check it in to https://github.com/golang/proposal, so that we work it over.

File src/internal/goexperiment/flags.go

Line 134, Patchset 5: GolfGC bool

t hepudds . resolved

Is the "F" in "GOLF" accurate for this CL and proposal? My initial impression was the "Fixer" part of the GOLF paper is effectively not part of this proposal, but maybe I misunderstood.
Rather than using `GolfGC` as the GOEXPERIMENT name, one idea would be to instead just spell it out, such as maybe `GoroutineLeak` and hence used as `GOEXPERIMENT=goroutineleak`?
That might be clearer for someone who doesn't recognize GOLF. It's a little long, but it wouldn't be the longest GOEXPERIMENT. (For example, current GOEXPERIMENTs like `PreemptibleLoops`, `SyncHashTrieMap`, and `AliasTypeParams` are all more characters than `GoroutineLeak`).
Also, more often than not, I think GOEXPERIMENTs tend to be full words, though I think you have flexibility on that. (I didn't count, but skimming the current list, it's maybe full words ~2/3 of time, though it depends on how you count words like `func` and `sync`).
Or if you don't like `GoroutineLeak`, maybe you have a better idea for how to avoid the GOLF acronym here? Or maybe others will have opinions. (Or maybe this comment is just off base 😅).

Georgian-Vlad Saioc

The "F" in this version would instead be "Finder". That way we can save the acronym, but, since the length is not a problem, I'm all for `GoroutineLeakFinderGC`. It doesn't roll off the tongue, but is transparent in what it does.

P.S. It just occurred to me that, as spunky names go, we could also call it GoldGC (D for Detection) 😎. (Though, if we want to eventually allow for reclaim under special circumstances eventually, e.g., as a Hail Mary approach to try to prevent OOM, we have to pivot back to Golf)

File src/runtime/mgc.go

Line 367, Patchset 5: markrootNext uint32 // next markroot job


	markrootJobs uint32 // number of markroot jobs

Michael Knyszek . unresolved

I mention this elsewhere, but I think we could just make these atomic.Uint32. I don't think they're going to be very heavily contended.

Georgian-Vlad Saioc

Them being accessed atomically is actually a holdover from an older iteration where each mark worker could concurrently expand the set of roots, but now that the check for additional non-leaked goroutines is only performed in `gcMarkDone` by the last GC worker, I'm trying make the access to markrootNext and markrootJobs non-atomic. The problem is that behaviour is still unstable when implementing those changes. AFAICT there seems to be a race that causes some roots to end up unmarked, leading to zombies. However, I unfortunately haven't yet figured out the source of the problem.

Line 386, Patchset 5: // detection during the next GC cycle; it is set by DetectGoroutineLeaks()

Michael Knyszek . resolved

nit: replace with spaces

Georgian-Vlad Saioc

They should have been, but seem to have been "tabified" by git 😕

Line 393, Patchset 5: pendingGoleakDetection, detectingGoleaks, detectedGoleaks bool

Michael Knyszek . resolved

a few nits on the representation and naming.

for one, I think this would be cleaner as a nested anonymous struct. I also think the names could be a little clearer ("detected" seems to imply that we successfully detected a leak, where really it just means "finished"). maybe something like:
```
goLeakCheck struct {
    pending bool
    enabled bool
    done    bool
}
```

"enabled" and "done" are also more consistent with naming elsewhere in the runtime.

Georgian-Vlad Saioc

Acknowledged

Line 582, Patchset 5: // This write should be thread-safe, as the overwritten value is true.


	// pendingGoleakDetection is only set to false under STW at the start
	// of the GC cycle that picks it up.

Michael Knyszek . resolved

I see no reason why this couldn't just be an atomic.Bool. I don't think it makes much of a difference performance-wise and it's just a little easier to reason about (removes any doubt about its synchronization).

Georgian-Vlad Saioc

Acknowledged

Line 840, Patchset 5:

			// Write is thread-safe because the world is stopped
			work.detectingGoleaks = true

Michael Knyszek . resolved

I think you should just document detectingGoleaks to be protected by STW and remove this comment. that's more consistent with how we document other variables.

Georgian-Vlad Saioc

Done

Line 946, Patchset 5: if goexperiment.GolfGC {

Michael Knyszek . resolved

the extra goexperiment check is unnecessary. IMO this is clearly a very cheap and non-hot-path check, so it doesn't need the compile-time branch.

Georgian-Vlad Saioc

Done

Line 951, Patchset 5 (Parent): work.tMarkTerm = now

Michael Knyszek . resolved

I think there's a bug here in that tMarkTerm might no longer be set correctly if the goexperiment is disabled. why not just leave this line and update it below?

Georgian-Vlad Saioc

Done

Line 1076, Patchset 5: now = nanotime()


		work.tMarkTerm = now
		// Check again whether any P needs to flush its write barrier
		// to the GC work queue.
		systemstack(func() {
			for _, p := range allp {
				wbBufFlush1(p)
				if !p.gcw.empty() {
					restart = true
					break
				}
			}
		})

		// If that is the case, restart again. Once restarts are no longer needed,
		// run this without deadlock detection.
		if restart {
			gcDebugMarkDone.restartedDueTo27993 = true

			getg().m.preemptoff = ""
			systemstack(func() {
				// Accumulate the time we were stopped before we had to start again.
				work.cpuStats.accumulateGCPauseTime(nanotime()-stw.finishedStopping, work.maxprocs)

				now := startTheWorldWithSema(0, stw)
				work.pauseNS += now - stw.startedStopping
			})
			semrelease(&worldsema)
			goto top
		}

Michael Knyszek . resolved

is this necessary? intuitively I would think that it isn't. the leak detector will already restart the GC when it needs to (goto top), and this restart (anything due to issue #27993) will already have happened above if it was necessary. does something break if you remove this? if so, I suspect something else may be wrong.

Georgian-Vlad Saioc

Good catch! This is a holdover from back when we did the reclaim.

Line 1144, Patchset 5:// Check if an object is marked in the heap.

Michael Knyszek . resolved

minor nit: for consistency, documentation should start with the name of the function.

Georgian-Vlad Saioc

Done

Line 1145, Patchset 5:func checkIfMarked(p unsafe.Pointer) bool {

Michael Knyszek . resolved

I think this should go next to its callees in mbitmap.go (so the details all live together) and for consistency with other functions in the runtime, let's maybe call this `isMarkedOrNotInHeap`? I think that's the most clear name of what this function is checking, even though it is a mouthful.
(the details here are kind of subtle actually. just "IfMarked" doesn't say much about how it treats non-heap memory, but the caller of this function wants to treat all non-heap memory as essentially reachable.)

Georgian-Vlad Saioc

Done

Line 1161, Patchset 5:func (gp *g) maybeLive() bool {

Michael Knyszek . unresolved

I think this name is a little unclear ("live" is *very* overloaded in the runtime, as you can imagine). WDYT about inverting the condition it returns and calling it something like `isPermanentlyBlocked`?

Georgian-Vlad Saioc

Agreed on the term `live`, this is a holdover from the paper terminology. But `isPermanentlyBlocked` is probably not the best name. While `maybeLive` _eventually_ returns `true` for all semantically runnable goroutines, it does return `false` for some of them towards the beginning of the GC cycle, before it reached a fixed point over reachable memory (and semantically runnable Gs). I worry that `isPermanentlyBlocked` may confuse maintainers because of how much it sounds like a final verdict.

I think a good name should capture the notion that `true` means the goroutine may still be runnable (according to GOLF, at least), while `false` may either mean that the goroutine is definitely not runnable OR we simply haven't figured it out yet. What about something a bit looser, like `maybeRunnable` or `checkIfMaybeRunnable`?

P.S. It's frustrating that, even though everything is so simple to understand, it's hard to come up with good, short names for things 😕.

Line 1896, Patchset 5: if !work.detectingGoleaks {


		return work.markrootNext < work.markrootJobs
	}
	rootNext := atomic.Load(&work.markrootNext)
	rootJobs := atomic.Load(&work.markrootJobs)
	return rootNext < rootJobs

Michael Knyszek . resolved

I would prefer to keep the previous structure which is more explicit, but I think we can do that (and avoid checking detectingGoleaks) if we just make work.markrootNext and work.markrootJobs atomic variables. I think it's fine. loads are cheap on most architectures and these variables are not hot.

Georgian-Vlad Saioc

Ah, I see now. It's because of the comment, right? `root scan work available`.
I was a bit confused initially as to why the returned booleans were made explicit. Restored.

File src/runtime/mgcmark.go

Line 55, Patchset 5:

	gcUndoBitMask = uintptr(uintptrMask >> 2) // This constant reserves some bits of the address space for the GC to use in order to mask addresses.


	gcBitMask     = ^gcUndoBitMask            // This flips every bit in gcUndoBitMask of uinptr width

Michael Knyszek . unresolved

it occurs to me that the masking approach means that this implementation might struggle on 32-bit platforms. I mentioned this long ago, but I would still like to consider hiding the few pointers we need to from the GC some other way.

Georgian-Vlad Saioc

What about adjusting the offset to 1 bit for 32-bit systems (or, less ideally, but if push comes to shove, just disabling this altogether for 32-bit systems).

File src/runtime/runtime2.go

Line 110, Patchset 5:

Michael Knyszek . resolved

nit: remove newline to stay consistent with the above

Georgian-Vlad Saioc

Done

File src/runtime/sema.go

Line 508, Patchset 5: s.g.waiting = nil

Michael Knyszek . resolved

this could use a comment

Georgian-Vlad Saioc

Done

Line 631, Patchset 5: // Storing this pointer is

Michael Knyszek . resolved

incomplete comment

Georgian-Vlad Saioc

Done

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Jul 22, 2025, 11:01:09 AMJul 22

to Gerrit Bot, goph...@pubsubhelper.golang.org, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Michael Knyszek, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 1 comment

Patchset-level comments

File-level comment, Patchset 6 (Latest):

Georgian-Vlad Saioc . resolved

@mkny...@google.com I am trying the approach you suggested with tristate pointers in order to hide addresses from the GC, but I am encountering some significant hurdles (*g objects getting marked in mysterious ways, and whatever channels they transitively reference, even when leaked, or, when dealing with semas in the *sudog semtable, cryptic crashes caused by referencing unreserved spans).

It is also worth noting that changing the types of *sudog fields and allgs require touching a lot of more of the runtime code than is comfortable (and can be potentially risky, depending on what assumptions are made).

I am going to keep at it for a bit longer and see whether I can work out, but would also propose an alternative approach, in case this becomes intractable:

In order to curtail the cost of the bitmask check on the hotpath, instead dynamically change GC behaviour, e.g., by providing it with a different implementation of scanobject that does the bitmasking check.

There is a maintenance burden introduced here, but there are scant few occurrences of this throughout the runtime (e.g., comments in gcDrain and gcDrainN)

For 32-bit architectures, either:
a. (less ideally but easier to implement) disallow the goroutine leak check altogether. This can simply be achieved with some tricks such as changing how the bitmask is computed and having it settled to not occurring. Then, upon triggering goroutine leak detection in the GC, a warning is issued on 32-bit systems saying it cannot be performed.
b. (better option) allow the goroutine leak detection (maybe use a 1-bit mask only to double the available addresses) but perform a memory utilization check beforehand. If memory used falls below the threshold, allow it. Otherwise issue a warning stating that memory is too low for the leak check.

The only major downside: larger binary sizes because of duplicated code.
Do you think this approach is better?

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Jul 22, 2025, 3:49:33 PMJul 22

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 1 comment

Patchset-level comments

File-level comment, Patchset 6 (Latest):

Georgian-Vlad Saioc . resolved

@mkny...@google.com I am trying the approach you suggested with tristate pointers in order to hide addresses from the GC, but I am encountering some significant hurdles (*g objects getting marked in mysterious ways, and whatever channels they transitively reference, even when leaked, or, when dealing with semas in the *sudog semtable, cryptic crashes caused by referencing unreserved spans).
It is also worth noting that changing the types of *sudog fields and allgs require touching a lot of more of the runtime code than is comfortable (and can be potentially risky, depending on what assumptions are made).
I am going to keep at it for a bit longer and see whether I can work out, but would also propose an alternative approach, in case this becomes intractable:
In order to curtail the cost of the bitmask check on the hotpath, instead dynamically change GC behaviour, e.g., by providing it with a different implementation of scanobject that does the bitmasking check.
There is a maintenance burden introduced here, but there are scant few occurrences of this throughout the runtime (e.g., comments in gcDrain and gcDrainN)
For 32-bit architectures, either:
a. (less ideally but easier to implement) disallow the goroutine leak check altogether. This can simply be achieved with some tricks such as changing how the bitmask is computed and having it settled to not occurring. Then, upon triggering goroutine leak detection in the GC, a warning is issued on 32-bit systems saying it cannot be performed.
b. (better option) allow the goroutine leak detection (maybe use a 1-bit mask only to double the available addresses) but perform a memory utilization check beforehand. If memory used falls below the threshold, allow it. Otherwise issue a warning stating that memory is too low for the leak check.
The only major downside: larger binary sizes because of duplicated code.
Do you think this approach is better?

Michael Knyszek

I don't think having a second copy of scanobject addresses my core concerns. we'd need a second copy of all the green tea stuff as well.

I'm looking at this more closely now, and I think you're right that changing allg might be too much, and too error-prone -- apologies for that. however, I did start updating the sudog fields, and that seems pretty tractable, especially with a dedicated type.

if I'm following this correctly, is the primary issue scanning allg that it reaches the *hchan via the sudog? does that match your understanding?

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall
Michael Pratt
t hepudds

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Jul 22, 2025, 6:04:15 PMJul 22

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 1 comment

Patchset-level comments

File-level comment, Patchset 6 (Latest):

Georgian-Vlad Saioc . unresolved

@mkny...@google.com I am trying the approach you suggested with tristate pointers in order to hide addresses from the GC, but I am encountering some significant hurdles (*g objects getting marked in mysterious ways, and whatever channels they transitively reference, even when leaked, or, when dealing with semas in the *sudog semtable, cryptic crashes caused by referencing unreserved spans).
It is also worth noting that changing the types of *sudog fields and allgs require touching a lot of more of the runtime code than is comfortable (and can be potentially risky, depending on what assumptions are made).
I am going to keep at it for a bit longer and see whether I can work out, but would also propose an alternative approach, in case this becomes intractable:
In order to curtail the cost of the bitmask check on the hotpath, instead dynamically change GC behaviour, e.g., by providing it with a different implementation of scanobject that does the bitmasking check.
There is a maintenance burden introduced here, but there are scant few occurrences of this throughout the runtime (e.g., comments in gcDrain and gcDrainN)
For 32-bit architectures, either:
a. (less ideally but easier to implement) disallow the goroutine leak check altogether. This can simply be achieved with some tricks such as changing how the bitmask is computed and having it settled to not occurring. Then, upon triggering goroutine leak detection in the GC, a warning is issued on 32-bit systems saying it cannot be performed.
b. (better option) allow the goroutine leak detection (maybe use a 1-bit mask only to double the available addresses) but perform a memory utilization check beforehand. If memory used falls below the threshold, allow it. Otherwise issue a warning stating that memory is too low for the leak check.
The only major downside: larger binary sizes because of duplicated code.
Do you think this approach is better?

Michael Knyszek

I don't think having a second copy of scanobject addresses my core concerns. we'd need a second copy of all the green tea stuff as well.
I'm looking at this more closely now, and I think you're right that changing allg might be too much, and too error-prone -- apologies for that. however, I did start updating the sudog fields, and that seems pretty tractable, especially with a dedicated type.
if I'm following this correctly, is the primary issue scanning allg that it reaches the *hchan via the sudog? does that match your understanding?

Michael Knyszek

here's my attempt to solve this problem: https://go.dev/cl/689696

it seems to work for at least simple cases. I haven't tried it yet on more complicated cases.

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 24, 2025, 6:23:11 AMJul 24

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #7 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall

Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 24, 2025, 7:58:38 AMJul 24

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #8 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 25, 2025, 5:57:49 AMJul 25

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #9 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 25, 2025, 7:39:07 AMJul 25

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #10 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Jul 25, 2025, 7:51:59 AMJul 25

to Gerrit Bot, goph...@pubsubhelper.golang.org, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Michael Knyszek, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 5 comments

Patchset-level comments

File-level comment, Patchset 6:

Georgian-Vlad Saioc . resolved

@mkny...@google.com I am trying the approach you suggested with tristate pointers in order to hide addresses from the GC, but I am encountering some significant hurdles (*g objects getting marked in mysterious ways, and whatever channels they transitively reference, even when leaked, or, when dealing with semas in the *sudog semtable, cryptic crashes caused by referencing unreserved spans).
It is also worth noting that changing the types of *sudog fields and allgs require touching a lot of more of the runtime code than is comfortable (and can be potentially risky, depending on what assumptions are made).
I am going to keep at it for a bit longer and see whether I can work out, but would also propose an alternative approach, in case this becomes intractable:
In order to curtail the cost of the bitmask check on the hotpath, instead dynamically change GC behaviour, e.g., by providing it with a different implementation of scanobject that does the bitmasking check.
There is a maintenance burden introduced here, but there are scant few occurrences of this throughout the runtime (e.g., comments in gcDrain and gcDrainN)
For 32-bit architectures, either:
a. (less ideally but easier to implement) disallow the goroutine leak check altogether. This can simply be achieved with some tricks such as changing how the bitmask is computed and having it settled to not occurring. Then, upon triggering goroutine leak detection in the GC, a warning is issued on 32-bit systems saying it cannot be performed.
b. (better option) allow the goroutine leak detection (maybe use a 1-bit mask only to double the available addresses) but perform a memory utilization check beforehand. If memory used falls below the threshold, allow it. Otherwise issue a warning stating that memory is too low for the leak check.
The only major downside: larger binary sizes because of duplicated code.
Do you think this approach is better?

Michael Knyszek

I don't think having a second copy of scanobject addresses my core concerns. we'd need a second copy of all the green tea stuff as well.
I'm looking at this more closely now, and I think you're right that changing allg might be too much, and too error-prone -- apologies for that. however, I did start updating the sudog fields, and that seems pretty tractable, especially with a dedicated type.
if I'm following this correctly, is the primary issue scanning allg that it reaches the *hchan via the sudog? does that match your understanding?

Michael Knyszek

here's my attempt to solve this problem: https://go.dev/cl/689696
it seems to work for at least simple cases. I haven't tried it yet on more complicated cases.

Georgian-Vlad Saioc

Done

File src/runtime/mgc.go

Line 367, Patchset 5:

	markrootNext uint32 // next markroot job
	markrootJobs uint32 // number of markroot jobs

Michael Knyszek . resolved

I mention this elsewhere, but I think we could just make these atomic.Uint32. I don't think they're going to be very heavily contended.

Georgian-Vlad Saioc

Them being accessed atomically is actually a holdover from an older iteration where each mark worker could concurrently expand the set of roots, but now that the check for additional non-leaked goroutines is only performed in `gcMarkDone` by the last GC worker, I'm trying make the access to markrootNext and markrootJobs non-atomic. The problem is that behaviour is still unstable when implementing those changes. AFAICT there seems to be a race that causes some roots to end up unmarked, leading to zombies. However, I unfortunately haven't yet figured out the source of the problem.

Georgian-Vlad Saioc

Done

Line 1161, Patchset 5:func (gp *g) maybeLive() bool {

Michael Knyszek . resolved

I think this name is a little unclear ("live" is *very* overloaded in the runtime, as you can imagine). WDYT about inverting the condition it returns and calling it something like `isPermanentlyBlocked`?

Georgian-Vlad Saioc

Agreed on the term `live`, this is a holdover from the paper terminology. But `isPermanentlyBlocked` is probably not the best name. While `maybeLive` _eventually_ returns `true` for all semantically runnable goroutines, it does return `false` for some of them towards the beginning of the GC cycle, before it reached a fixed point over reachable memory (and semantically runnable Gs). I worry that `isPermanentlyBlocked` may confuse maintainers because of how much it sounds like a final verdict.
I think a good name should capture the notion that `true` means the goroutine may still be runnable (according to GOLF, at least), while `false` may either mean that the goroutine is definitely not runnable OR we simply haven't figured it out yet. What about something a bit looser, like `maybeRunnable` or `checkIfMaybeRunnable`?
P.S. It's frustrating that, even though everything is so simple to understand, it's hard to come up with good, short names for things 😕.

Georgian-Vlad Saioc

Done

File src/runtime/mgcmark.go

Line 55, Patchset 5:

	gcUndoBitMask = uintptr(uintptrMask >> 2) // This constant reserves some bits of the address space for the GC to use in order to mask addresses.
	gcBitMask     = ^gcUndoBitMask            // This flips every bit in gcUndoBitMask of uinptr width

Michael Knyszek . resolved

it occurs to me that the masking approach means that this implementation might struggle on 32-bit platforms. I mentioned this long ago, but I would still like to consider hiding the few pointers we need to from the GC some other way.

Georgian-Vlad Saioc

What about adjusting the offset to 1 bit for 32-bit systems (or, less ideally, but if push comes to shove, just disabling this altogether for 32-bit systems).

Georgian-Vlad Saioc

Done

Line 59, Patchset 5:// gcMask masks addresses that should not be automatically marked during the GC.

Michael Knyszek . resolved

there are three problems with the gcMask mechanism that concern me.
1. it's invasive. I understand that you've ran some benchmarks and don't see any difference. that's good. if it was absolutely necessary I would be open to the idea, but it's invasive enough that it needs to be understood by every place that scans objects. this is just scanobject for now, but in Green Tea it will be more. not a huge deal, but it would be simpler to reason about these deep and hot paths in the runtime if we didn't have to think about masked pointers.
2. reserving the top 2 bits for masking is probably a no-go on 32-bit platforms. while usage on 32-bit platforms is low, it's not nothing. I think it's important that this functionality works everywhere. (again, if this was absolutely necessary, then fine, we can say only 64-bit. but I don't think that it is. see below.)
3. it's harder to reason about what the GC will and will not observe statically. right now when we see a uintptr that has a pointer stored into it, it raises alarm bells that some field may contain a pointer that the GC doesn't know about. these are typically _very_ carefully managed, and signals to the maintainers that we need to look at a variable's usage carefully. gcMask is unfortunately not a static property of a variable but a dynamic property. this means (1) we're adding a new kind of pointer-hiding we don't already do, and so aren't familiar with, and (2) variables can be sometimes masked sometimes not-masked, which is just more complicated to reason about at a baseline.
so like I said, I think we can avoid pointer masking. here's how.
if I'm understanding correctly, there are exactly 2 places where masked GC pointers are necessary. I think we can hide both of them as necessary without adding the masking mechanism. I believe this will be simpler, since one problem with the masking, besides being somewhat invasive,
the first place is allg. I think we can fill allg with guintptr instead of `*g`. if we're not detecting deadlock goroutines, I think we should simply iterate over the `allg` and manually enqueue them with the GC. when we are detecting deadlocked goroutines, we can do what your code currently does, which is create a new `[]*g` for whatever set of `g`s we want to expose to the GC. I think this is sufficient to hide the goroutines in `allg` that you need to hide.
the second place is sudog.elem, only in the syncSema case. the first question I have is why it even *is* okay to mask the elem field at all. I think the code, as written (and I left a comment to this effect) presumes that the masked sudog.elem will be picked by from somewhere else (like the goroutine's stack) but I don't think this is a guarantee. I think to avoid a future footgun, we really want to be able to presume that sudog.elem could be the last pointer to the object its pointing to.
here's an apporach that I think would work. first, I think we can change elem to simply be a uintptr. then, we can add a second field to the sudog called elemLive which is an unsafe.Pointer. in the usual codepaths, we manage both elem and elemLive together, and keep them in sync. (we can even add a new special type to wrap the two (struct{ p uintptr; live unsafe.Pointer }) to make it abundantly clear that they're always operated on together.) when it comes time to perform the goroutine leak analysis in the next GC cycle, we walk all the blocked goroutines (AFAICT we do this anyway) and clear g.waiting.elemLive (before enabling the write barrier), based on the G's waitreason. later, we perform the goroutine leak detection, and call checkIfMarked. after we have an answer, we immediately queue s.elem explicitly with the GC if it's not marked, and restore s.elemLive.
I understand this is a lot to consider, but I think it works out better, provided it works. maybe there's something I'm missing that means this doesn't work, and the full dynamism of the gcMask approach is necessary, but at the moment I don't see why it would be. I also think this approach pushes more of the complexity into the leak detector and out of the regular GC paths.
let me know what you think.

Georgian-Vlad Saioc

Done

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements

Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Jul 25, 2025, 7:52:18 AMJul 25

to Gerrit Bot, goph...@pubsubhelper.golang.org, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Michael Knyszek, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 1 comment

Patchset-level comments

File-level comment, Patchset 5:

Michael Knyszek . resolved

also, this change needs tests. I think the best thing to do would be to use src/runtime/testdata/testprog to create leaked goroutines in subprocesses and then check the output from the runtime tests. (look for tests that call `runTestProg`. they invoke a particular test in the test program by passing a particular command line parameter, which usually maps to the name of the function that is the entry point in the testprog binary.)

Georgian-Vlad Saioc

Done

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 25, 2025, 7:56:01 AMJul 25

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #11 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 25, 2025, 8:03:40 AMJul 25

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #12 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Jul 25, 2025, 8:05:56 AMJul 25

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Michael Knyszek, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Jul 25, 2025, 8:16:49 AMJul 25

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Michael Knyszek, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 1 comment

File src/runtime/mgcmark.go

Line 173, Patchset 5: if goexperiment.GolfGC {

Michael Knyszek . resolved

same as elsewhere, I think the extra branch here is hurting more than it's helping in terms of readability. I would instead put `// For goexperiment.GolfGC.` next to all branches depending on `work.detectingGoleaks`

Georgian-Vlad Saioc

Done

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 25, 2025, 9:07:17 AMJul 25

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #13 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result-1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Jul 28, 2025, 12:51:16 PMJul 28

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 1 comment

File src/runtime/mgc.go

Line 739, Patchset 13 (Latest): mode = gcForceBlockMode

Michael Knyszek . unresolved

IIRC gcForceBlockMode is overkill. you need the mark phase to be STW, but this also makes the sweep phase STW. I think gcForceMode might be sufficient, unless I'm forgetting some other detail.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Jul 29, 2025, 11:25:12 AMJul 29

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 1 comment

File src/runtime/mgc.go

Line 739, Patchset 13 (Latest): mode = gcForceBlockMode

Michael Knyszek . unresolved

IIRC gcForceBlockMode is overkill. you need the mark phase to be STW, but this also makes the sweep phase STW. I think gcForceMode might be sufficient, unless I'm forgetting some other detail.

Georgian-Vlad Saioc

Acknowledged

On a different note, I found a bug when switching between concurrent and STW-GC on the fly (and this applies to the Go runtime in general, not just goroutine leak GC; it is even reproducible for Go1.22.5 without any Golf changes). Consider the following sequence:

1. Concurrent GC is running concurrently with user code, which triggers GC assists.

2. During the sweeping phase, some GC assists call gcMarkDone, once of which acquires the work.markDoneSema

3. A goroutine leak (or just mark-STW) GC cycle is initiated. This will preempt all goroutines, including the GC assists.

4. The assist which acquired work.markDoneSema is preempted mid-release of work.markDoneSema. This can happen because the GC assists increment work.nwait before calling gcMarkDone, meaning the GC may start transitioning into sweeping and starting a new GC cycle while many assists may still be trying to clear gcMarkDone.

5. At this point, the GC assists cannot be resumed while marking because they are flagged as user code (sched.user.disable=true and the schedEnabled property returns false).

6. The GC mark workers go on with their business, but once they call gcMarkDone, they subsequently block acquiring work.markDoneSema, which has not been released by the GC assist.

7. The scheduler will run checkdead while trying to run system goroutines. But, eventually, the number of idle M's will eventually cap out (passing the run == 0 check).

8. However, we will actually not get a global deadlock report! Instead, because some runnable (but not schedulable) user goroutines are still in the system, we will get a "checkdead: runnable g" panic.

The only reason we did not discover this earlier is because all GC cycles were uniform (set by GODEBUG flag) also in our paper variant. It's only the opt-in approach we added that triggers this (and, even there, it has that Heisenbug flavor).

I suppose the way to correct this is, upon initiating a goroutine leak GC cycle, to force every GC assist to clear the critical section in gcMarkDone protected by work.markDoneSema.

However, I think it's best we coordinate on this one, in order to get as clean a solution as we can. If you think this should go in a separate change, please let me know.

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Jul 30, 2025, 5:30:17 AMJul 30

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #14 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Jul 31, 2025, 9:31:08 PMJul 31

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 1 comment

File src/runtime/mgc.go

Line 735, Patchset 14 (Latest):

	} else if work.goroutineLeakFinder.pending.Load() || debug.gcgoroutineleaks > 0 {
		// If goroutine leak detection has been enabled (via GODEBUG=gcgoroutineleaks=1),
		// or via profiling, stop the world during the marking phase.
		mode = gcForceMode

Michael Knyszek . unresolved

hm, actually... taking a step back... why is this necessary at all? basically all the work your algorithm does is during the STW anyway... is it actually necessary?

what is it doing while the GC is actively running concurrently?

the roots are updated while the world is already stopped
the algorithm is run while the world is already stopped
untracking the sync objects happens while the world is stopped

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Jul 31, 2025, 9:32:39 PMJul 31

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 1 comment

File src/runtime/mgc.go

Line 739, Patchset 13: mode = gcForceBlockMode

Michael Knyszek . unresolved

IIRC gcForceBlockMode is overkill. you need the mark phase to be STW, but this also makes the sweep phase STW. I think gcForceMode might be sufficient, unless I'm forgetting some other detail.

Georgian-Vlad Saioc

Acknowledged
On a different note, I found a bug when switching between concurrent and STW-GC on the fly (and this applies to the Go runtime in general, not just goroutine leak GC; it is even reproducible for Go1.22.5 without any Golf changes). Consider the following sequence:
1. Concurrent GC is running concurrently with user code, which triggers GC assists.
2. During the sweeping phase, some GC assists call gcMarkDone, once of which acquires the work.markDoneSema
3. A goroutine leak (or just mark-STW) GC cycle is initiated. This will preempt all goroutines, including the GC assists.
4. The assist which acquired work.markDoneSema is preempted mid-release of work.markDoneSema. This can happen because the GC assists increment work.nwait before calling gcMarkDone, meaning the GC may start transitioning into sweeping and starting a new GC cycle while many assists may still be trying to clear gcMarkDone.
5. At this point, the GC assists cannot be resumed while marking because they are flagged as user code (sched.user.disable=true and the schedEnabled property returns false).
6. The GC mark workers go on with their business, but once they call gcMarkDone, they subsequently block acquiring work.markDoneSema, which has not been released by the GC assist.
7. The scheduler will run checkdead while trying to run system goroutines. But, eventually, the number of idle M's will eventually cap out (passing the run == 0 check).
8. However, we will actually not get a global deadlock report! Instead, because some runnable (but not schedulable) user goroutines are still in the system, we will get a "checkdead: runnable g" panic.
The only reason we did not discover this earlier is because all GC cycles were uniform (set by GODEBUG flag) also in our paper variant. It's only the opt-in approach we added that triggers this (and, even there, it has that Heisenbug flavor).
I suppose the way to correct this is, upon initiating a goroutine leak GC cycle, to force every GC assist to clear the critical section in gcMarkDone protected by work.markDoneSema.
However, I think it's best we coordinate on this one, in order to get as clean a solution as we can. If you think this should go in a separate change, please let me know.

Michael Knyszek

I think you found a real bug, but see my other comment. is it actually necessary to set `mode = gcForceMode` at all? AFAICT the entire algorithm runs with the world stopped anyway.

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Jul 31, 2025, 9:57:03 PMJul 31

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 11 comments

Patchset-level comments

File-level comment, Patchset 14 (Latest):

Michael Knyszek . resolved

looking good overall! thanks for adding the tests. I think we're pretty close here.

File src/runtime/mgc.go

Line 1098, Patchset 14 (Latest):// checkIfMaybeRunnable checks whether a goroutine may still be semantically runnable.

Michael Knyszek . unresolved

nit: 'checkIf' doesn't follow the current conventions we're trying to follow. maybe just 'isMaybeRunnable'? a little shorter, and says the same thing.

Line 1132, Patchset 14 (Latest): return isMarkedOrNotInHeap(gp.waiting.elem.get())

Michael Knyszek . unresolved

I think we may need to explicitly queue these with the GC if we discover them to be dead.

here's my thinking. imagine the following scenario.

1. we discover leaked goroutines. we do not mark their elements.
2. the mark phase completes, and we begin the sweep phase.
3. during the sweep phase, these objects are freed.
4. these objects are reallocated (same address).

(4) is problematic for a few reasons.

future analysis finds that the object *is* marked. the goroutine is no longer "leaked".
future allocations allocate *another* sync primitive to the same address. I worry about what could happen if the primitive occupies the same spot in the sema trees, which is indexed by address.

Line 1189, Patchset 14 (Latest):

//
//go:nosplit

Michael Knyszek . unresolved

in general we want to avoid the nosplit annotation if we can't point to a concrete reason for needing it. I don't think it's needed here; please remove.

Line 1224, Patchset 14 (Latest): casgstatus(gp, _Gwaiting, _Gleaked)

Michael Knyszek . resolved

is this mainly so it's obvious in goroutine profiles?

File src/runtime/mgcmark.go

Line 66, Patchset 14 (Latest):// The world must be stopped or allglock must be held.

Michael Knyszek . unresolved

nit: in terms of style, please start the doc comment with "allGsSnapshotSortedForGC takes a snapshot of allgs, ..."

Line 1198, Patchset 14 (Latest):func gcUpdateMarkrootNext() (uint32, bool) {

Michael Knyszek . unresolved

nit: this could use a comment explaining the returned results, and the fact that the caller *must* mark the obtained root index (they take ownership of it).

Line 1198, Patchset 14 (Latest):func gcUpdateMarkrootNext() (uint32, bool) {

Michael Knyszek . unresolved

nit: this is actually taking ownership of the next root to mark. maybe "gcNextMarkRoot"?

Line 1205, Patchset 14 (Latest): success = work.markrootNext.CompareAndSwap(next, next+1)

Michael Knyszek . unresolved

I think you can implement this as an atomic add *if* findGoLeaks resets markrootNext to markrootJobs (before the add) as well. this way it's totally fine if markrootNext temporarily exceeds markrootJobs. also, the add is wait-free, unlike this CAS loop.

Line 1284, Patchset 14 (Latest): job, success := gcUpdateMarkrootNext()

Michael Knyszek . unresolved

I know we use success in some places, but the prevailing idiom in newer Go code is to call this bool "ok," so please switch uses to that.

Line 1432, Patchset 14 (Latest): job, success := gcUpdateMarkrootNext()

Michael Knyszek . unresolved

same here

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Jul 31, 2025, 10:02:02 PMJul 31

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek voted Commit-Queue+1

Commit-Queue

+1

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 1, 2025, 11:59:57 AMAug 1

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #15 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result-1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 1, 2025, 12:29:49 PMAug 1

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 1, 2025, 2:51:55 PMAug 1

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds added 1 comment

File src/runtime/mgc.go

Line 576, Patchset 5:func FindGoLeaks() {

Michael Knyszek . unresolved

you know this but just leaving a note that this is a new API, so this we definitely can't add. this should be an unexported function that is made available to runtime/pprof (or whatever we plan to do) via linkname. (and we'll explicitly blocklist the linkname in the compiler for use outside of runtime/pprof.)

t hepudds

Hi @vsa...@uber.com, I know this is still WIP, but I'm curious if you are currently using `FindGoLeaks` anywhere, vs. if it could be unexported or removed for now?

That can happen later of course, but I'll confess it would be exciting to see more of the builders go green with passing tests, and it might be easier for you to see at a glance which failures are actually interesting (e.g., it looks like currently the race detector is complaining about some of the tests), vs. right now `FindGoLeaks` being exported is one of the things the builders are complaining about.

If for example you are using `FindGoLeaks` for test functions that are not yet published in this CL, in addition to what Michael said above, another option might be to export it just for testing via the `runtime/export_test.go` file (which as it says there -- `Export guts for testing.`) or similar. Alternatively, it can be pretty easy for the runtime to "push" an unexported symbol so that it is available in another package, with this as an concrete example that might make sense to emulate (this is the runtime pushing an unexported `newobject` runtime function into a separate `maps` package):

There are multiple ways to do linknames, and I'm not sure which one is most appropriate for you (especially for the final version), but wanted to at least leave a pointer to an example I had followed in the past.

In any event, maybe this would be a distraction right now, and depending on what you are using `FindGoLeaks` for currently, some or all of what I wrote above might not be relevant.

(Also, maybe it's actually obvious how you are using `FindGoLeaks` currently, and maybe I just missed it 🙃).

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 4, 2025, 4:03:16 AMAug 4

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #16 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result-1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 4, 2025, 4:11:22 AMAug 4

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #17 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Aug 4, 2025, 4:37:30 AMAug 4

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 10 comments

Patchset-level comments

File-level comment, Patchset 17 (Latest):

Georgian-Vlad Saioc . resolved

I added our microbenchmarks as part of the test suite to constantly check for stability and accuracy of the GC. At the moment, all of them are included, which might be excessive, so please let me know if you prefer to trim them down.

File src/runtime/mgc.go

Line 735, Patchset 14: } else if work.goroutineLeakFinder.pending.Load() || debug.gcgoroutineleaks > 0 {


		// If goroutine leak detection has been enabled (via GODEBUG=gcgoroutineleaks=1),
		// or via profiling, stop the world during the marking phase.
		mode = gcForceMode

Michael Knyszek . unresolved

hm, actually... taking a step back... why is this necessary at all? basically all the work your algorithm does is during the STW anyway... is it actually necessary?
what is it doing while the GC is actively running concurrently?
the roots are updated while the world is already stopped
the algorithm is run while the world is already stopped
untracking the sync objects happens while the world is stopped

Georgian-Vlad Saioc

The paper version ran concurrent GC, meaning it should be doable, but I cannot vouch yet for stability with all the other changes we made since then. I will keep you posted on the progress.

Line 1098, Patchset 14:// checkIfMaybeRunnable checks whether a goroutine may still be semantically runnable.

Michael Knyszek . resolved

nit: 'checkIf' doesn't follow the current conventions we're trying to follow. maybe just 'isMaybeRunnable'? a little shorter, and says the same thing.

Georgian-Vlad Saioc

Done

Line 1132, Patchset 14: return isMarkedOrNotInHeap(gp.waiting.elem.get())

Michael Knyszek . unresolved

I think we may need to explicitly queue these with the GC if we discover them to be dead.
here's my thinking. imagine the following scenario.
1. we discover leaked goroutines. we do not mark their elements.
2. the mark phase completes, and we begin the sweep phase.
3. during the sweep phase, these objects are freed.
4. these objects are reallocated (same address).
(4) is problematic for a few reasons.
future analysis finds that the object *is* marked. the goroutine is no longer "leaked".
future allocations allocate *another* sync primitive to the same address. I worry about what could happen if the primitive occupies the same spot in the sema trees, which is indexed by address.

Georgian-Vlad Saioc

I don't think queueing with the GC is necessary. On principle, the goroutine leak GC should mark all the memory the regular GC does. After detecting goroutine leaks, the GC transitions into the state s.t. leaks no longer need to be reported, and the mark phase is resumed to mark everything that is only reachable through leaked gs.

Line 1189, Patchset 14:

//
//go:nosplit

Michael Knyszek . resolved

in general we want to avoid the nosplit annotation if we can't point to a concrete reason for needing it. I don't think it's needed here; please remove.

Georgian-Vlad Saioc

Done

File src/runtime/mgcmark.go

Line 66, Patchset 14:// The world must be stopped or allglock must be held.

Michael Knyszek . resolved

nit: in terms of style, please start the doc comment with "allGsSnapshotSortedForGC takes a snapshot of allgs, ..."

Georgian-Vlad Saioc

Done

Line 1198, Patchset 14:func gcUpdateMarkrootNext() (uint32, bool) {

Michael Knyszek . resolved

nit: this could use a comment explaining the returned results, and the fact that the caller *must* mark the obtained root index (they take ownership of it).

Georgian-Vlad Saioc

Done

Line 1198, Patchset 14:func gcUpdateMarkrootNext() (uint32, bool) {

Michael Knyszek . resolved

nit: this is actually taking ownership of the next root to mark. maybe "gcNextMarkRoot"?

Georgian-Vlad Saioc

Done

Line 1284, Patchset 14: job, success := gcUpdateMarkrootNext()

Michael Knyszek . resolved

I know we use success in some places, but the prevailing idiom in newer Go code is to call this bool "ok," so please switch uses to that.

Georgian-Vlad Saioc

Done

Line 1432, Patchset 14: job, success := gcUpdateMarkrootNext()

Michael Knyszek . resolved

same here

Georgian-Vlad Saioc

Done

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 4, 2025, 7:28:08 AMAug 4

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #18 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 4, 2025, 8:16:27 AMAug 4

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted TryBot-Bypass+1

TryBot-Bypass

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement

TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 4, 2025, 8:18:27 AMAug 4

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted TryBot-Bypass+0

TryBot-Bypass

+0

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement

TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 4, 2025, 8:18:31 AMAug 4

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Aug 4, 2025, 10:44:15 AMAug 4

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 1 comment

Patchset-level comments

File-level comment, Patchset 17:

Georgian-Vlad Saioc . resolved

@mkny...@google.com now that the bitmask issue has been addressed, should we just phase out the GOEXPERIMENT?

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

t hepudds

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 5, 2025, 4:46:24 AMAug 5

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #19 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result-1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 5, 2025, 6:23:07 AMAug 5

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 5, 2025, 9:09:23 AMAug 5

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #20 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result-1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Aug 5, 2025, 9:41:14 AMAug 5

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 1 comment

Patchset-level comments

File-level comment, Patchset 20 (Latest):

Georgian-Vlad Saioc . resolved

@thepud...@gmail.com thanks for the help with the commit queue. I updated the tests again to further reduce the flakiness. Apologies for the back and forth on this. I try to be conservative with which tests are marked as flaky, such that the suite is as robust as possible, but it is tricky to pin them down due to the nature of the defects we are looking at.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 5, 2025, 9:47:44 AMAug 5

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 5, 2025, 10:13:16 AMAug 5

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds added 1 comment

Patchset-level comments

File-level comment, Patchset 20 (Latest):

Georgian-Vlad Saioc . resolved

@thepud...@gmail.com thanks for the help with the commit queue. I updated the tests again to further reduce the flakiness. Apologies for the back and forth on this. I try to be conservative with which tests are marked as flaky, such that the suite is as robust as possible, but it is tricky to pin them down due to the nature of the defects we are looking at.

t hepudds

No worries at all!! Mainly I’m just trying to help. Also, I think this CL is a great improvement for the ecosystem.

(Part of the reason I was asking if you could remove or unexport runtime.FindGoLeak as you’ve now done was so that you could more easily see more interesting failures without the noise of the API check complaining; more generally, it’s highly reasonable I think to start with a broader set of tests and see how they do in CI here, including it of course exercises things in different ways to be running on more platforms and more build configurations than you would want to do on your laptop or whatever you are doing for local development).

One other quick recommendation probably for later is as you are evaluating which tests are flaky or not is that `stress` is pretty handy:
https://pkg.go.dev/golang.org/x/tools/cmd/stress

You could let it run for example on a test VM overnight or similar.(Obviously, you could use a bash loop or other utilities instead instead, but `stress` is nice. Also, it can be interesting to pick the count of parallel processes with -p flag to either oversubscribe the available CPUs or undersubscribe CPUs to shake out different types of test flakiness).

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Aug 5, 2025, 11:12:39 AMAug 5

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall and Michael Pratt

Michael Knyszek added 1 comment

Patchset-level comments

File-level comment, Patchset 17:

Georgian-Vlad Saioc . resolved

@mkny...@google.com now that the bitmask issue has been addressed, should we just phase out the GOEXPERIMENT?

Michael Knyszek

keeping the GOEXPERIMENT is useful for disabling the behavior if it turns out there's a bug, since it does still touch parts of the GC. but we can set the GOEXPERIMENT enabled by default once we have (1) an approved proposal for an API, and (2) that API is implemented.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall

Michael Pratt

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Aug 5, 2025, 11:15:51 AMAug 5

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall and Michael Pratt

Michael Knyszek added 1 comment

File src/runtime/mbitmap.go

Line 1264, Patchset 20 (Latest): // (vsaioc) TODO: we could possibly be more precise by only checking against the stacks

Michael Knyszek . unresolved

this TODO is helpful, but just to match other TODOs, maybe TODO(vsaioc):?

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Aug 5, 2025, 11:19:29 AMAug 5

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall and Michael Pratt

Michael Knyszek added 1 comment

Patchset-level comments

File-level comment, Patchset 20 (Latest):

Georgian-Vlad Saioc . unresolved

@thepud...@gmail.com thanks for the help with the commit queue. I updated the tests again to further reduce the flakiness. Apologies for the back and forth on this. I try to be conservative with which tests are marked as flaky, such that the suite is as robust as possible, but it is tricky to pin them down due to the nature of the defects we are looking at.

t hepudds

No worries at all!! Mainly I’m just trying to help. Also, I think this CL is a great improvement for the ecosystem.
(Part of the reason I was asking if you could remove or unexport runtime.FindGoLeak as you’ve now done was so that you could more easily see more interesting failures without the noise of the API check complaining; more generally, it’s highly reasonable I think to start with a broader set of tests and see how they do in CI here, including it of course exercises things in different ways to be running on more platforms and more build configurations than you would want to do on your laptop or whatever you are doing for local development).
One other quick recommendation probably for later is as you are evaluating which tests are flaky or not is that `stress` is pretty handy:
https://pkg.go.dev/golang.org/x/tools/cmd/stress
You could let it run for example on a test VM overnight or similar.(Obviously, you could use a bash loop or other utilities instead instead, but `stress` is nice. Also, it can be interesting to pick the count of parallel processes with -p flag to either oversubscribe the available CPUs or undersubscribe CPUs to shake out different types of test flakiness).

Michael Knyszek

+1 to stress as a way to identify flaky tests.

as a general rule, we really don't want to check in flaky tests. if they do run, then they make CI less reliable. if they don't run, why are they there at all? it's more likely they'll just bit-rot.

are there things we can do to try to make the tests less flaky? some kind of mode we can turn on? for example, I wouldn't be surprised if conservative scanning due to asynchronous preemption is a problem. perhaps the test binary should have GODEBUG=asyncpreemptoff=1 passed to it?

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 5, 2025, 12:50:54 PMAug 5

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall and Michael Pratt

t hepudds added 1 comment

Patchset-level comments

File-level comment, Patchset 20 (Latest):

Georgian-Vlad Saioc . unresolved

@thepud...@gmail.com thanks for the help with the commit queue. I updated the tests again to further reduce the flakiness. Apologies for the back and forth on this. I try to be conservative with which tests are marked as flaky, such that the suite is as robust as possible, but it is tricky to pin them down due to the nature of the defects we are looking at.

t hepudds

No worries at all!! Mainly I’m just trying to help. Also, I think this CL is a great improvement for the ecosystem.
(Part of the reason I was asking if you could remove or unexport runtime.FindGoLeak as you’ve now done was so that you could more easily see more interesting failures without the noise of the API check complaining; more generally, it’s highly reasonable I think to start with a broader set of tests and see how they do in CI here, including it of course exercises things in different ways to be running on more platforms and more build configurations than you would want to do on your laptop or whatever you are doing for local development).
One other quick recommendation probably for later is as you are evaluating which tests are flaky or not is that `stress` is pretty handy:
https://pkg.go.dev/golang.org/x/tools/cmd/stress
You could let it run for example on a test VM overnight or similar.(Obviously, you could use a bash loop or other utilities instead instead, but `stress` is nice. Also, it can be interesting to pick the count of parallel processes with -p flag to either oversubscribe the available CPUs or undersubscribe CPUs to shake out different types of test flakiness).

Michael Knyszek

+1 to stress as a way to identify flaky tests.
as a general rule, we really don't want to check in flaky tests. if they do run, then they make CI less reliable. if they don't run, why are they there at all? it's more likely they'll just bit-rot.
are there things we can do to try to make the tests less flaky? some kind of mode we can turn on? for example, I wouldn't be surprised if conservative scanning due to asynchronous preemption is a problem. perhaps the test binary should have GODEBUG=asyncpreemptoff=1 passed to it?

t hepudds

(And just to make sure it was clear what I was saying in response to your “Apologies for the back and forth on this” comment — it’s 100% fine to go through some iterations of deflaking while working on a CL, and hence no need to apologize. It can help you see the other platforms by running the tests here, as well as help have a conversation about how to modify the tests and deflaking strategy and so on… though of course you would not want to _commit_ flakey tests.)

One other strategy is in some cases it can be helpful to differentiate based on the -short flag so that all.bash for example is not slowed by a test case that takes longer or needs more repetition to be robust, though there is of course a balancing act with not wanting to slow the long tests too much, and of course the long tests shouldn’t flake either. (Maybe short is already on your mind, or maybe I missed you already doing it).

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 7, 2025, 7:06:34 AMAug 7

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #21 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result-1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall

Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Aug 7, 2025, 7:26:14 AMAug 7

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 3 comments

Patchset-level comments

File-level comment, Patchset 20:

Georgian-Vlad Saioc . unresolved

@thepud...@gmail.com thanks for the help with the commit queue. I updated the tests again to further reduce the flakiness. Apologies for the back and forth on this. I try to be conservative with which tests are marked as flaky, such that the suite is as robust as possible, but it is tricky to pin them down due to the nature of the defects we are looking at.

t hepudds

No worries at all!! Mainly I’m just trying to help. Also, I think this CL is a great improvement for the ecosystem.
(Part of the reason I was asking if you could remove or unexport runtime.FindGoLeak as you’ve now done was so that you could more easily see more interesting failures without the noise of the API check complaining; more generally, it’s highly reasonable I think to start with a broader set of tests and see how they do in CI here, including it of course exercises things in different ways to be running on more platforms and more build configurations than you would want to do on your laptop or whatever you are doing for local development).
One other quick recommendation probably for later is as you are evaluating which tests are flaky or not is that `stress` is pretty handy:
https://pkg.go.dev/golang.org/x/tools/cmd/stress
You could let it run for example on a test VM overnight or similar.(Obviously, you could use a bash loop or other utilities instead instead, but `stress` is nice. Also, it can be interesting to pick the count of parallel processes with -p flag to either oversubscribe the available CPUs or undersubscribe CPUs to shake out different types of test flakiness).

Michael Knyszek

+1 to stress as a way to identify flaky tests.
as a general rule, we really don't want to check in flaky tests. if they do run, then they make CI less reliable. if they don't run, why are they there at all? it's more likely they'll just bit-rot.
are there things we can do to try to make the tests less flaky? some kind of mode we can turn on? for example, I wouldn't be surprised if conservative scanning due to asynchronous preemption is a problem. perhaps the test binary should have GODEBUG=asyncpreemptoff=1 passed to it?

t hepudds

(And just to make sure it was clear what I was saying in response to your “Apologies for the back and forth on this” comment — it’s 100% fine to go through some iterations of deflaking while working on a CL, and hence no need to apologize. It can help you see the other platforms by running the tests here, as well as help have a conversation about how to modify the tests and deflaking strategy and so on… though of course you would not want to _commit_ flakey tests.)
One other strategy is in some cases it can be helpful to differentiate based on the -short flag so that all.bash for example is not slowed by a test case that takes longer or needs more repetition to be robust, though there is of course a balancing act with not wanting to slow the long tests too much, and of course the long tests shouldn’t flake either. (Maybe short is already on your mind, or maybe I missed you already doing it).

Georgian-Vlad Saioc

@mkny...@google.com, just to clarify, "flaky" in this context only means that some goroutine leaks for tested example are not perfectly reproducible (due to non-determinism).

The tests (or microbenchmarks if you prefer) are split between:
1. A set of simpler examples with reliably reproducible (100% of the time) goleaks, for basic functionality checks. These fail if expected leaks are not reported.
2. A set of more complex examples (derived from real-world bugs, to boot! details in the comments). However, some don't reliably leak, unfortunately, and massaging them into that state is not really worth the time.
IMO, a decent strategy here is to just check for a set of "expected" but "flaky", with three outcomes: a leak in the set occurred in the run (fine), a leak in the set did not occur in the line (fine - we just got unlucky), an unexpected leak occurred (fail - zero tolerance policy for false positives).

While this makes them less reliable for checking that leak detection works, they do stress the GC in other ways, e.g., one of these led me to discover the STW runtime bug.

I am ok with conservatively marking complex examples as "flaky leaks" for now (only a few seem to still fail CI, but just in case). But I think the extra coverage they provide makes them a valuable inclusion. What do you think?

File-level comment, Patchset 21 (Latest):

Georgian-Vlad Saioc . resolved

Hi @mkny...@google.com and @thepud...@gmail.com.

BIG UPDATE: Goroutine leak profiles! No more debug flags or unwanted APIs.

It is provided as "goroutineleak" in pprof. Under the hood, this is just a regular goroutine profile, but collecting it first triggers (and waits for) the goroutine leak finder GC, and then profiles are only collected for goroutines that have been transitioned to the leaked status.

It's not quite clear to me yet whether the implementation is desirable (plus there may be dragons here I can't see right now), so feel free to give feedback.

Also, I used stress with tests to reduce flakiness (and marked more of the microbenchmarks as flaky -- see comment below). The only one that is still a problem afaict is grpc/3017 in about 1-2% of runs because of the GC STW bug. I will try to see whether we can switch back to a non-STW, but the effort needed for that is not yet clear, given all the other changes.

File src/runtime/mbitmap.go

Line 1264, Patchset 20: // (vsaioc) TODO: we could possibly be more precise by only checking against the stacks

Michael Knyszek . resolved

this TODO is helpful, but just to match other TODOs, maybe TODO(vsaioc):?

Georgian-Vlad Saioc

Done

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements

Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 7, 2025, 7:48:04 AMAug 7

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Aug 7, 2025, 12:03:00 PMAug 7

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 3 comments

Patchset-level comments

File-level comment, Patchset 20:

Georgian-Vlad Saioc . resolved

@thepud...@gmail.com thanks for the help with the commit queue. I updated the tests again to further reduce the flakiness. Apologies for the back and forth on this. I try to be conservative with which tests are marked as flaky, such that the suite is as robust as possible, but it is tricky to pin them down due to the nature of the defects we are looking at.

t hepudds

No worries at all!! Mainly I’m just trying to help. Also, I think this CL is a great improvement for the ecosystem.
(Part of the reason I was asking if you could remove or unexport runtime.FindGoLeak as you’ve now done was so that you could more easily see more interesting failures without the noise of the API check complaining; more generally, it’s highly reasonable I think to start with a broader set of tests and see how they do in CI here, including it of course exercises things in different ways to be running on more platforms and more build configurations than you would want to do on your laptop or whatever you are doing for local development).
One other quick recommendation probably for later is as you are evaluating which tests are flaky or not is that `stress` is pretty handy:
https://pkg.go.dev/golang.org/x/tools/cmd/stress
You could let it run for example on a test VM overnight or similar.(Obviously, you could use a bash loop or other utilities instead instead, but `stress` is nice. Also, it can be interesting to pick the count of parallel processes with -p flag to either oversubscribe the available CPUs or undersubscribe CPUs to shake out different types of test flakiness).

Michael Knyszek

+1 to stress as a way to identify flaky tests.
as a general rule, we really don't want to check in flaky tests. if they do run, then they make CI less reliable. if they don't run, why are they there at all? it's more likely they'll just bit-rot.
are there things we can do to try to make the tests less flaky? some kind of mode we can turn on? for example, I wouldn't be surprised if conservative scanning due to asynchronous preemption is a problem. perhaps the test binary should have GODEBUG=asyncpreemptoff=1 passed to it?

t hepudds

(And just to make sure it was clear what I was saying in response to your “Apologies for the back and forth on this” comment — it’s 100% fine to go through some iterations of deflaking while working on a CL, and hence no need to apologize. It can help you see the other platforms by running the tests here, as well as help have a conversation about how to modify the tests and deflaking strategy and so on… though of course you would not want to _commit_ flakey tests.)
One other strategy is in some cases it can be helpful to differentiate based on the -short flag so that all.bash for example is not slowed by a test case that takes longer or needs more repetition to be robust, though there is of course a balancing act with not wanting to slow the long tests too much, and of course the long tests shouldn’t flake either. (Maybe short is already on your mind, or maybe I missed you already doing it).

Georgian-Vlad Saioc

@mkny...@google.com, just to clarify, "flaky" in this context only means that some goroutine leaks for tested example are not perfectly reproducible (due to non-determinism).
The tests (or microbenchmarks if you prefer) are split between:
1. A set of simpler examples with reliably reproducible (100% of the time) goleaks, for basic functionality checks. These fail if expected leaks are not reported.
2. A set of more complex examples (derived from real-world bugs, to boot! details in the comments). However, some don't reliably leak, unfortunately, and massaging them into that state is not really worth the time.
IMO, a decent strategy here is to just check for a set of "expected" but "flaky", with three outcomes: a leak in the set occurred in the run (fine), a leak in the set did not occur in the line (fine - we just got unlucky), an unexpected leak occurred (fail - zero tolerance policy for false positives).
While this makes them less reliable for checking that leak detection works, they do stress the GC in other ways, e.g., one of these led me to discover the STW runtime bug.
I am ok with conservatively marking complex examples as "flaky leaks" for now (only a few seem to still fail CI, but just in case). But I think the extra coverage they provide makes them a valuable inclusion. What do you think?

Michael Knyszek

an unexpected leak occurred (fail - zero tolerance policy for false positives).

this plus your stress testing argument seem like a good reason to keep these tests running and just be flexible about their output.

SGTM. I still need to review the test code, but I understand where you're coming from now. thanks!

File-level comment, Patchset 21 (Latest):

Georgian-Vlad Saioc . resolved

Hi @mkny...@google.com and @thepud...@gmail.com.
BIG UPDATE: Goroutine leak profiles! No more debug flags or unwanted APIs.
It is provided as "goroutineleak" in pprof. Under the hood, this is just a regular goroutine profile, but collecting it first triggers (and waits for) the goroutine leak finder GC, and then profiles are only collected for goroutines that have been transitioned to the leaked status.
It's not quite clear to me yet whether the implementation is desirable (plus there may be dragons here I can't see right now), so feel free to give feedback.
Also, I used stress with tests to reduce flakiness (and marked more of the microbenchmarks as flaky -- see comment below). The only one that is still a problem afaict is grpc/3017 in about 1-2% of runs because of the GC STW bug. I will try to see whether we can switch back to a non-STW, but the effort needed for that is not yet clear, given all the other changes.

Michael Knyszek

I left a comment about the way the profile is written out at a high level. I think most of the goroutine profile machinery is overkill, since we can exploit a simple invariant: leaked goroutines stay leaked, and will not run. (I know they switch back to _Gwaiting, for the deadlock finding GC cycle, but that's fine. if you want to protect against that, you can take gcsema for the duration as well.)

I will try to see whether we can switch back to a non-STW, but the effort needed for that is not yet clear, given all the other changes.

I really don't think it'll require any effort, really. I'll tinker with it a bit in case I'm missing something, but like I said before, all the work the analysis is doing is with the world stopped anyway. the only difference is that user goroutines will be allowed to run.

File src/runtime/mprof.go

Line 1336, Patchset 21 (Latest): stw := stopTheWorld(stwGoroutineProfile)

Michael Knyszek . unresolved

I think these two STW pauses and setting up all the goroutine profile machinery is overkill. goroutines that are leaked will stay leaked. also, it's stack won't change. I think you can just do forEachGRace and call doGoroutineProfile. keep acquiring goroutineProfileSema so that you can reuse its state.

if you're worried about a concurrent GC starting, you can semacquire(&gcsema) to prevent a new GC from starting, but I'm not even sure that's necessary either.

this implementation will also record goroutines that are not leaked. the goroutine profile machinery catches goroutines before they start running after the STW. but leaked goroutines will never run.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Aug 7, 2025, 12:03:22 PMAug 7

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 1 comment

File src/runtime/gc_test.go

File-level comment, Patchset 21 (Latest):

Michael Knyszek . unresolved

FYI this file is marked executable. please remove the executable bit.

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 11, 2025, 4:57:44 AMAug 11

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #22 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result-1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall

Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Aug 11, 2025, 5:04:35 AMAug 11

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 3 comments

File src/runtime/gc_test.go

File-level comment, Patchset 21:

Michael Knyszek . resolved

FYI this file is marked executable. please remove the executable bit.

Georgian-Vlad Saioc

Done

File src/runtime/mgc.go

Line 576, Patchset 5:func FindGoLeaks() {

Michael Knyszek . resolved

you know this but just leaving a note that this is a new API, so this we definitely can't add. this should be an unexported function that is made available to runtime/pprof (or whatever we plan to do) via linkname. (and we'll explicitly blocklist the linkname in the compiler for use outside of runtime/pprof.)

t hepudds

Hi @vsa...@uber.com, I know this is still WIP, but I'm curious if you are currently using `FindGoLeaks` anywhere, vs. if it could be unexported or removed for now?
That can happen later of course, but I'll confess it would be exciting to see more of the builders go green with passing tests, and it might be easier for you to see at a glance which failures are actually interesting (e.g., it looks like currently the race detector is complaining about some of the tests), vs. right now `FindGoLeaks` being exported is one of the things the builders are complaining about.
If for example you are using `FindGoLeaks` for test functions that are not yet published in this CL, in addition to what Michael said above, another option might be to export it just for testing via the `runtime/export_test.go` file (which as it says there -- `Export guts for testing.`) or similar. Alternatively, it can be pretty easy for the runtime to "push" an unexported symbol so that it is available in another package, with this as an concrete example that might make sense to emulate (this is the runtime pushing an unexported `newobject` runtime function into a separate `maps` package):
https://github.com/golang/go/blob/e666972a674eb96f5847c7bbeb5c6e1bcb572af5/src/runtime/malloc.go#L1816-L1819
https://github.com/golang/go/blob/e666972a674eb96f5847c7bbeb5c6e1bcb572af5/src/internal/runtime/maps/runtime.go#L33-L34
There are multiple ways to do linknames, and I'm not sure which one is most appropriate for you (especially for the final version), but wanted to at least leave a pointer to an example I had followed in the past.
In any event, maybe this would be a distraction right now, and depending on what you are using `FindGoLeaks` for currently, some or all of what I wrote above might not be relevant.
(Also, maybe it's actually obvious how you are using `FindGoLeaks` currently, and maybe I just missed it 🙃).

Georgian-Vlad Saioc

Closing this for now, because we switched to a profile format for this.

File src/runtime/mgcmark.go

Line 1205, Patchset 14: success = work.markrootNext.CompareAndSwap(next, next+1)

Michael Knyszek . resolved

I think you can implement this as an atomic add *if* findGoLeaks resets markrootNext to markrootJobs (before the add) as well. this way it's totally fine if markrootNext temporarily exceeds markrootJobs. also, the add is wait-free, unlike this CAS loop.

Georgian-Vlad Saioc

I couldn't quite get the atomic add to work for leak detection, but the behavior is switched back to the original implementation during regular GC cycles.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements

Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 11, 2025, 5:14:44 AMAug 11

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #23 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 11, 2025, 8:13:08 AMAug 11

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 11, 2025, 8:55:01 AMAug 11

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #24 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result+1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Aug 11, 2025, 11:12:47 AMAug 11

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 4 comments

Patchset-level comments

File-level comment, Patchset 24 (Latest):

Georgian-Vlad Saioc . resolved

Hi @thepud...@gmail.com. I pushed another set of changes to the tests. Flakiness has been drastically driven down (30m running with stress at 0 failures). Some tests are also now using Gosched and GOMAXPROCS=1 to force the schedule into deterministic behavior, and that also allowed the moving of several more microbenchmarks from "flaky" to "guaranteed leaks".

File src/runtime/mgc.go

Line 735, Patchset 14:

	} else if work.goroutineLeakFinder.pending.Load() || debug.gcgoroutineleaks > 0 {
		// If goroutine leak detection has been enabled (via GODEBUG=gcgoroutineleaks=1),
		// or via profiling, stop the world during the marking phase.
		mode = gcForceMode

Michael Knyszek . resolved

hm, actually... taking a step back... why is this necessary at all? basically all the work your algorithm does is during the STW anyway... is it actually necessary?
what is it doing while the GC is actively running concurrently?
the roots are updated while the world is already stopped
the algorithm is run while the world is already stopped
untracking the sync objects happens while the world is stopped

Georgian-Vlad Saioc

The paper version ran concurrent GC, meaning it should be doable, but I cannot vouch yet for stability with all the other changes we made since then. I will keep you posted on the progress.

Georgian-Vlad Saioc

Done

Line 739, Patchset 13: mode = gcForceBlockMode

Michael Knyszek . resolved

IIRC gcForceBlockMode is overkill. you need the mark phase to be STW, but this also makes the sweep phase STW. I think gcForceMode might be sufficient, unless I'm forgetting some other detail.

Georgian-Vlad Saioc

Acknowledged
On a different note, I found a bug when switching between concurrent and STW-GC on the fly (and this applies to the Go runtime in general, not just goroutine leak GC; it is even reproducible for Go1.22.5 without any Golf changes). Consider the following sequence:
1. Concurrent GC is running concurrently with user code, which triggers GC assists.
2. During the sweeping phase, some GC assists call gcMarkDone, once of which acquires the work.markDoneSema
3. A goroutine leak (or just mark-STW) GC cycle is initiated. This will preempt all goroutines, including the GC assists.
4. The assist which acquired work.markDoneSema is preempted mid-release of work.markDoneSema. This can happen because the GC assists increment work.nwait before calling gcMarkDone, meaning the GC may start transitioning into sweeping and starting a new GC cycle while many assists may still be trying to clear gcMarkDone.
5. At this point, the GC assists cannot be resumed while marking because they are flagged as user code (sched.user.disable=true and the schedEnabled property returns false).
6. The GC mark workers go on with their business, but once they call gcMarkDone, they subsequently block acquiring work.markDoneSema, which has not been released by the GC assist.
7. The scheduler will run checkdead while trying to run system goroutines. But, eventually, the number of idle M's will eventually cap out (passing the run == 0 check).
8. However, we will actually not get a global deadlock report! Instead, because some runnable (but not schedulable) user goroutines are still in the system, we will get a "checkdead: runnable g" panic.
The only reason we did not discover this earlier is because all GC cycles were uniform (set by GODEBUG flag) also in our paper variant. It's only the opt-in approach we added that triggers this (and, even there, it has that Heisenbug flavor).
I suppose the way to correct this is, upon initiating a goroutine leak GC cycle, to force every GC assist to clear the critical section in gcMarkDone protected by work.markDoneSema.
However, I think it's best we coordinate on this one, in order to get as clean a solution as we can. If you think this should go in a separate change, please let me know.

Michael Knyszek

I think you found a real bug, but see my other comment. is it actually necessary to set `mode = gcForceMode` at all? AFAICT the entire algorithm runs with the world stopped anyway.

Georgian-Vlad Saioc

Done

File src/runtime/mprof.go

Line 1336, Patchset 21: stw := stopTheWorld(stwGoroutineProfile)

Michael Knyszek . resolved

I think these two STW pauses and setting up all the goroutine profile machinery is overkill. goroutines that are leaked will stay leaked. also, it's stack won't change. I think you can just do forEachGRace and call doGoroutineProfile. keep acquiring goroutineProfileSema so that you can reuse its state.

if you're worried about a concurrent GC starting, you can semacquire(&gcsema) to prevent a new GC from starting, but I'm not even sure that's necessary either.
this implementation will also record goroutines that are not leaked. the goroutine profile machinery catches goroutines before they start running after the STW. but leaked goroutines will never run.

Georgian-Vlad Saioc

Done

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 11, 2025, 11:13:14 AMAug 11

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #25 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 11, 2025, 11:18:36 AMAug 11

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Aug 11, 2025, 11:20:25 AMAug 11

to Gerrit Bot, goph...@pubsubhelper.golang.org, t hepudds, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

Georgian-Vlad Saioc added 1 comment

Patchset-level comments

File-level comment, Patchset 24:

Georgian-Vlad Saioc . resolved

@mkny...@google.com I addressed the comments regarding STW for goroutine leak profiling, and have re-enabled concurrent GC during leak detection. @thepud...@gmail.com looking forward to trying CI again and other experiments.

@mkny...@google.com, btw I had a FIXME for `goroutine(Leak)ProfileWithLabelsConcurrent`. There is the `if n != int(endOffset)` check at lines 1378 and 1488 in runtime/mprof.go that does not seem to do anything, but maybe I am misreading something. I copied it for leak detection variant, but maybe this can be removed in both? (Or just for leaks for now and on another branch for regular goroutine profiles)

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 12, 2025, 5:32:24 AMAug 12

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #26 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result-1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 12, 2025, 7:58:02 AMAug 12

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Aug 12, 2025, 8:18:33 AMAug 12

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 1 comment

Patchset-level comments

File-level comment, Patchset 26 (Latest):

Georgian-Vlad Saioc . resolved

Hi @thepud...@gmail.com, thanks for submitting to CQ again. For some reason it seems like the Cockroach584 test keeps failing on Windows AMD64, but I can't tell why. I thought that increasing the number of yields in the main goroutine will allow the leaky goroutine to get to that state, but it seems that did not help. I don't get any flakiness locally (MacOS-ARM). Given the consistent failures, do you think there is something specific to this OS/arch combo that causes the flakiness?

The program is in `testgoroutineleakgc/cockroach584.go`. if you want to have a look. No need to over-exert. If we can't resolve this quickly, I'll just mark it as flaky and call it a day.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

t hepudds

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Aug 12, 2025, 8:19:22 AMAug 12

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 1 comment

Patchset-level comments

File-level comment, Patchset 26 (Latest):

Georgian-Vlad Saioc . unresolved

Hi @thepud...@gmail.com, thanks for submitting to CQ again. For some reason it seems like the Cockroach584 test keeps failing on Windows AMD64, but I can't tell why. I thought that increasing the number of yields in the main goroutine will allow the leaky goroutine to get to that state, but it seems that did not help. I don't get any flakiness locally (MacOS-ARM). Given the consistent failures, do you think there is something specific to this OS/arch combo that causes the flakiness?
The program is in `testgoroutineleakgc/cockroach584.go`. if you want to have a look. No need to over-exert. If we can't resolve this quickly, I'll just mark it as flaky and call it a day.

Georgian-Vlad Saioc

Marked as resolved by mistake

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 13, 2025, 4:42:30 AMAug 13

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #27 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result-1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 13, 2025, 6:56:18 AMAug 13

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 14, 2025, 7:58:08 AMAug 14

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds added 1 comment

Patchset-level comments

File-level comment, Patchset 27 (Latest):

t hepudds . resolved

Hi Vlad, I noticed you asking about the `runtime.TestGdbBacktrace` failure in the runtime diagnostics meeting notes.

You might have already come to this conclusion, but FWIW that's probably a pre-existing and unrelated flake -- maybe #74355 "runtime: TestGdbBacktrace failures", which was closed a few weeks ago as duplicate of the open issue #58932 "runtime: TestGdb*".

And just in case you are not familiar with `watchflakes`, if you suspect you are seeing a test flake unrelated to your change, there's a good chance it will be already tracked as a `watchflakes` issue in GitHub, so searching for the failing test name or error message in the issue tracker often finds something (or you can [browse recently updated watchflakes issues](https://github.com/golang/go/issues?q=is%3Aissue%20watchflakes%20sort%3Aupdated-desc)).

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 14, 2025, 7:58:15 AMAug 14

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Aug 14, 2025, 9:40:54 AMAug 14

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds added 1 comment

Patchset-level comments

File-level comment, Patchset 26:

Georgian-Vlad Saioc . unresolved

Hi @thepud...@gmail.com, thanks for submitting to CQ again. For some reason it seems like the Cockroach584 test keeps failing on Windows AMD64, but I can't tell why. I thought that increasing the number of yields in the main goroutine will allow the leaky goroutine to get to that state, but it seems that did not help. I don't get any flakiness locally (MacOS-ARM). Given the consistent failures, do you think there is something specific to this OS/arch combo that causes the flakiness?
The program is in `testgoroutineleakgc/cockroach584.go`. if you want to have a look. No need to over-exert. If we can't resolve this quickly, I'll just mark it as flaky and call it a day.

Georgian-Vlad Saioc

Marked as resolved by mistake

t hepudds

Hi Vlad, I don't know why in particular that builder might fail that test more frequently, but in general different LUCI trybots might be underpowered or oversubscribed, or some phenomena might partially depend on the exact count of logical cores, or timing of different CPUs, etc. I think it's reasonable to move that test to flaky (which I think you've now done).

Separately, two other options to maybe consider:

1. Maybe running the test programs N times in a loop to find the required detected leaks (at least once) might make all the tests more robust against later failing intermittently in LUCI for other people. (e.g., maybe one of the current detections not marked flaky right now will be flaky on riscv64 or some other trybot). That said, I don't know how long they take to execute currently.

2. There is for example a runtime test flag `-run_concurrent_map_tests` which defaults to false and is used to "also run flaky concurrent map tests". You could consider having a loosely similar flag that runs your tests for longer and/or with more repetitions while also attempting to validate the detections for some (or all?) of the tests currently marked flaky (that is, fail the test if the expected goroutine leaks are not detected). This would default to off, but could be something that you run yourself for example while polishing this CL (maybe even overnight if needed). It could stick around for a while, or you could leave a TODO (for example maybe near the definition of the GOEXPERIMENT) to also suggest considering removing that flag when the GOEXPERIMENT is removed. In general, I think it's not at all desirable for every CL to show up with its own custom test flags, but there is some precedent, and might be worthwhile to at least consider in this case. Alternatively, maybe you could create some external GitHub repo with your examples that could serve a similar purpose (or maybe you already have one 😅). Part of the reason I'm wondering about this is that it might also let you shift more tests into the "flaky" category (and increase overall test robustness) if you have some convenient way to validate the detections for the "flaky" tests.

----

Two quick examples for running the test program N times:

Here's an example of a detection that reportedly worked ~99.95% of the time, so the solution I think was to execute the test program 10 times in a row:

https://github.com/golang/go/blob/320df537cc554610aacdc12a183944eb89db8337/src/runtime/crash_test.go#L782-L816

This example also uses a loop and uses whether `-short` is set to pick the iteration count (so `all.bash` does fewer iterations than the "longtest" trybots), though here I think the sense of this test is reversed from yours, so I suspect not exactly what you would do:

https://github.com/golang/go/blob/320df537cc554610aacdc12a183944eb89db8337/src/runtime/proc_test.go#L1014-L1026

-------

For defining a (perhaps temporary) test flag that does more work and/or attempts to validate more than a normal test, here are some quick examples from stdlib via grep:

```
runtime/crash_test.go: flag.Bool("run_concurrent_map_tests", false, "also run flaky concurrent map tests")
runtime/map_benchmark_test.go: flag.Bool("mapbench", false, "enable the full set of map benchmark variants")
runtime/malloc_test.go: flag.Int("n", 1000, "number of goroutines")
crypto/mlkem/mlkem_test.go: flag.Bool("million", false, "run the million vector test")
crypto/rsa/rsa_test.go: flag.Bool("all", false, "test all key sizes up to 2048")
go/types/hilbert_test.go: flag.Int("H", 5, "Hilbert matrix size")
```

In any event, exciting to see the progress, including exciting to see the green LUCI results!

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement

TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 14, 2025, 9:46:42 AMAug 14

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #28 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result+1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt

t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement

TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Aug 21, 2025, 1:55:17 PMAug 21

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 1 comment

File src/runtime/mgc.go

Line 1132, Patchset 14: return isMarkedOrNotInHeap(gp.waiting.elem.get())

Michael Knyszek . unresolved

I think we may need to explicitly queue these with the GC if we discover them to be dead.
here's my thinking. imagine the following scenario.
1. we discover leaked goroutines. we do not mark their elements.
2. the mark phase completes, and we begin the sweep phase.
3. during the sweep phase, these objects are freed.
4. these objects are reallocated (same address).
(4) is problematic for a few reasons.
future analysis finds that the object *is* marked. the goroutine is no longer "leaked".
future allocations allocate *another* sync primitive to the same address. I worry about what could happen if the primitive occupies the same spot in the sema trees, which is indexed by address.

Georgian-Vlad Saioc

I don't think queueing with the GC is necessary. On principle, the goroutine leak GC should mark all the memory the regular GC does. After detecting goroutine leaks, the GC transitions into the state s.t. leaks no longer need to be reported, and the mark phase is resumed to mark everything that is only reachable through leaked gs.

Michael Knyszek

gcRestoreSyncObjects is only executed *after* the mark phase completes. if it was called and we continued another GC round after findGoLeaks returned true, then I'd agree, but as it stands, those pointers will be missing to the GC throughout leak detection, AFAICT.

what am I missing?

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Aug 21, 2025, 2:44:11 PMAug 21

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek voted and added 23 comments

Votes added by Michael Knyszek

Commit-Queue

+1

23 comments

Patchset-level comments

File-level comment, Patchset 28 (Latest):

Michael Knyszek . resolved

looking great. my only functional comments are on the new stuff, the profile stuff, and just that one thread about whether the sync primitives will correctly be kept alive.

everything else is just super minor nits.

I don't expect to have many comments after this -- this should be ready to land after next review round, I think. the code overall looks great.

thank you so much for sticking with this! and thanks so much for the extensive test suite. it's really great.

File src/internal/goexperiment/exp_goleakfindergc_on.go

File-level comment, Patchset 28 (Latest):

Michael Knyszek . unresolved

this filename seems strange, shouldn't it be `exp_goroutineleakfindergc_on.go` if it was generated?

File src/internal/goexperiment/flags.go

Line 125, Patchset 28 (Latest): GoroutineLeakFinderGC bool

Michael Knyszek . unresolved

I like that this is descriptive, but I think we should pick a shorter name for this, haha. it should be something descriptive but memorable (so that users understand what they're enabling/disabling and that we don't have to keep looking it up constantly).

to that end, I don't think the "GC" part is doing too much here. I'm not sure who should care about the fact that it's part of the GC.

"finder" is also a little bit outside of the names we use for other diagnostic tools. it would be nice to align it with something else to make it easier to remember.

here are some other bikeshed colors:

`gosan` as in `goroutine sanitizer` which aligns with the sanitizer names.
`goleakdetector`, which aligns with "race detector."
`goleakprofiler`, which aligns with our profilers.

I don't feel too strongly about any of this, so if one of these or something else sounds good to you, then that's fine with me. I'm personally partial to `goleakprofiler` since we're targeting the `runtime/pprof` package to begin with.

File src/net/http/pprof/pprof.go

Line 368, Patchset 28 (Latest):

	"goroutineleak": "Stack traces of all leaked goroutines. Use debug=2 as a query parameter to export in the same format as an unrecovered panic.",

Michael Knyszek . unresolved

I don't know how much trouble it would be for you, but it would be good to hide this behind the GOEXPERIMENT as well, just since the proposal isn't accepted yet.

File src/runtime/crash_test.go

Line 191, Patchset 28 (Latest): // TODO(61405): Remove this once it's enabled by default.

Michael Knyszek . unresolved

nit: go.dev/issue/61405.

File src/runtime/gc_test.go

File-level comment, Patchset 28 (Latest):

Michael Knyszek . resolved

you _very_ thorough test suite, including real-world examples, makes me so very happy. thank you for your diligence here. 💯 💯 💯

Line 1101, Patchset 28 (Latest):func TestGoroutineLeakGC(t *testing.T) {

Michael Knyszek . resolved

I made comments about the name of the GOEXPERIMENT, but here I think the use of "GC" is a helpful hint to the people working on the implementation, since it's clearer that it's part of the GC. so please leave this, if you do change the name.

Line 1108, Patchset 28 (Latest): // Repetitions are used to amortize flakiness in some tests.

Michael Knyszek . unresolved

nit: reduce

Line 1114, Patchset 28 (Latest): // flakyLeaks are goroutine leaks that are too flaky to be reliably detected.

Michael Knyszek . unresolved

maybe mention that they do detect real leaks. i.e. they're not false positives. just flaky.

Line 1124, Patchset 28 (Latest):

		name string,
		flaky bool,
		leaks ...string) testCase {

Michael Knyszek . unresolved

nit: this can all go on one line. we generally don't care about line length in this codebase (barring really extreme cases).

File src/runtime/mgc.go

Line 385, Patchset 28 (Latest): // The GC has been instructed to perform goroutine leak detection during the next GC cycle;

Michael Knyszek . unresolved

"If set to true, the GC has been instructed..."

just so there's no ambiguity.

Line 386, Patchset 28 (Latest): // it is set by DetectGoroutineLeaks() and unset during gcStart().

Michael Knyszek . unresolved

stale?

Line 386, Patchset 28 (Latest): // it is set by DetectGoroutineLeaks() and unset during gcStart().

Michael Knyszek . unresolved

we tend to not put parens after functions in comments. I would say drop this here and elsewhere, but it's a very minor nit.

Line 388, Patchset 28 (Latest): // The GC is running in goroutine leak detection mode; it is set during gcStart()

Michael Knyszek . unresolved

super minor nit: other comments in this code base tend to say something like "Indicates that" for this kind of signaling state.

Line 389, Patchset 28 (Latest): // and unset during gcMarkTermination(). Is protected by STW.

Michael Knyszek . unresolved

super minor nit: to match the voice of other comments, maybe just "Protected by STW." and put that on its own line so it's not buried in the text.

Line 572, Patchset 28 (Latest)://go:linkname runtime_goroutineLeakGC runtime/pprof.runtime_goroutineLeakGC

Michael Knyszek . unresolved

we probably want to explicitly blocklist this linkname, since it unfortunately becomes visible to everyone. see go.dev/cl/681497 for how to do that (it should be trivial.)

Line 585, Patchset 28 (Latest): // actually performs goroutine leak detection.

Michael Knyszek . unresolved

might be worth a comment as to why this is necessary, and that we only expect one call to be made in the vast majority of cases.

Line 813, Patchset 28 (Latest): if work.goroutineLeakFinder.pending.Load() {

Michael Knyszek . unresolved

a quick signpost comment here would be good.

Line 816, Patchset 28 (Latest): gcUntrackSyncObjects()

Michael Knyszek . unresolved

a signpost comment explaining the purpose of this line here would be helpful.

Line 1201, Patchset 28 (Latest): // Report goroutine leaks and mark them unreachable, and resume marking

Michael Knyszek . unresolved

assertWorldStopped()?

I think this function and its callees rely on the world being stopped at this point.

File src/runtime/mprof.go

Line 1336, Patchset 28 (Latest):

	// Using gleakcount while the world is stopped should give us a consistent view
	// of the number of leaked goroutines.

Michael Knyszek . unresolved

not true since the world is not stopped here anymore. :)

Line 1348, Patchset 28 (Latest):

	// Prepare for all other goroutines to enter the profile. Every goroutine struct in the allgs list
	// has its goroutineProfiled field cleared. Any goroutine created from this point on (while
	// goroutineProfile.active is set) will start with its goroutineProfiled
	// field set to goroutineProfileSatisfied.
	goroutineProfile.active = true
	goroutineProfile.records = p
	goroutineProfile.labels = labels

Michael Knyszek . unresolved

this still seems unnecessary to me.

why not just call doRecordGoroutineProfile instead of tryRecordGoroutineProfile below? is there some reason that doesn't work?

then we don't need to modify goroutineProfile at all, nor iterate over all the goroutines and set goroutineProfileAbsent.

File src/runtime/proc.go

Line 5565, Patchset 28 (Latest): lock(&allglock)

Michael Knyszek . unresolved

I think this is overkill.

I'm not super thrilled either about being non-preemptible for an O(goroutine) operation, which can potentially hold up a GC for a long time.

here's an idea: what if you just kept a global atomic count? at the end of every goroutine leak detection cycle, simply tally up the number, and store it in the global. then, this function is trivial, just a single atomic load. no need to worry about anything else.

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Aug 27, 2025, 5:57:08 AMAug 27

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #29 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result+1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Aug 27, 2025, 6:11:34 AMAug 27

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, Michael Knyszek, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Georgian-Vlad Saioc added 22 comments

Patchset-level comments

File-level comment, Patchset 26:

Georgian-Vlad Saioc

Re-introduced repetitions for both flaky and mandatory tests. For mandatory tests, this makes them more robust (if at least one run produces the expected leak, it passes). The short flag is used to reduce the number of repetitions of flaky tests from 21 to 3

File-level comment, Patchset 28:

Georgian-Vlad Saioc . resolved

Hi again @mkny...@google.com and @thepud...@gmail.com. I tried to address almost all comments except for one by @mkny...@google.com, that being moving the `"goroutineleak"` pprof flag behind goexperiment.

Looking at how the code has developed, I noticed that there are now only 4 places protected by the experimental flag, anyway, all of them in `src/runtime/sema.go`:

1. https://go-review.googlesource.com/c/go/+/688335/29/src/runtime/sema.go#300
2. https://go-review.googlesource.com/c/go/+/688335/29/src/runtime/sema.go#481
3. https://go-review.googlesource.com/c/go/+/688335/29/src/runtime/sema.go#605
3. https://go-review.googlesource.com/c/go/+/688335/29/src/runtime/sema.go#628

none of which have any particularly large impact. All the other changes are gated behind booleans added to the `work` GC controller, which are at a type level, anyway.

What do think about taking out the experiment entirely, and simply temporarily hiding the goroutine leak profile flag from the documentation and profile listing (with a note to add later)?
This way, people won't be alerted that a new profile exists, unless they dig into the code, and we can also use it within Uber at a large scale during the next release with minimal friction for our deployment pipelines.

Looking forward to your response.

BTW, @mkny...@google.com, I left a comment where we use `shade` to add the concurrency primitives causing leaks to the GC work queue. Let me know if there is a better way to do this.

File src/internal/goexperiment/exp_goleakfindergc_on.go

File-level comment, Patchset 28:

Michael Knyszek . resolved

this filename seems strange, shouldn't it be `exp_goroutineleakfindergc_on.go` if it was generated?

Georgian-Vlad Saioc

When we were going through different iterations for the name of the flag, I simply edited it by hand and forgot to rename it 😐.

File src/internal/goexperiment/flags.go

Line 125, Patchset 28: GoroutineLeakFinderGC bool

Michael Knyszek . unresolved

I like that this is descriptive, but I think we should pick a shorter name for this, haha. it should be something descriptive but memorable (so that users understand what they're enabling/disabling and that we don't have to keep looking it up constantly).
to that end, I don't think the "GC" part is doing too much here. I'm not sure who should care about the fact that it's part of the GC.
"finder" is also a little bit outside of the names we use for other diagnostic tools. it would be nice to align it with something else to make it easier to remember.
here are some other bikeshed colors:
`gosan` as in `goroutine sanitizer` which aligns with the sanitizer names.
`goleakdetector`, which aligns with "race detector."
`goleakprofiler`, which aligns with our profilers.
I don't feel too strongly about any of this, so if one of these or something else sounds good to you, then that's fine with me. I'm personally partial to `goleakprofiler` since we're targeting the `runtime/pprof` package to begin with.

Georgian-Vlad Saioc

I am partial to `goleakprofiler`, too. You're right that the users' main focus will be how they interact with the tool, not the internals.

File src/runtime/crash_test.go

Line 191, Patchset 28: // TODO(61405): Remove this once it's enabled by default.

Michael Knyszek . unresolved

nit: go.dev/issue/61405.

Georgian-Vlad Saioc

How about removing this comment completely? The range-over-int experiment at "go.dev/issue/61405" should now have been enabled by default (I think this TODO is a holdover from before a rebase on main).

File src/runtime/gc_test.go

Line 1108, Patchset 28: // Repetitions are used to amortize flakiness in some tests.

Michael Knyszek . resolved

nit: reduce

Georgian-Vlad Saioc

Done

Line 1114, Patchset 28: // flakyLeaks are goroutine leaks that are too flaky to be reliably detected.

Michael Knyszek . resolved

maybe mention that they do detect real leaks. i.e. they're not false positives. just flaky.

Georgian-Vlad Saioc

Done

Line 1124, Patchset 28: name string,


		flaky bool,
		leaks ...string) testCase {

Michael Knyszek . resolved

nit: this can all go on one line. we generally don't care about line length in this codebase (barring really extreme cases).

Georgian-Vlad Saioc

Done

File src/runtime/mgc.go

Line 385, Patchset 28: // The GC has been instructed to perform goroutine leak detection during the next GC cycle;

Michael Knyszek . resolved

"If set to true, the GC has been instructed..."
just so there's no ambiguity.

Georgian-Vlad Saioc

Done

Line 386, Patchset 28: // it is set by DetectGoroutineLeaks() and unset during gcStart().

Michael Knyszek . resolved

stale?

Georgian-Vlad Saioc

Done

Line 386, Patchset 28: // it is set by DetectGoroutineLeaks() and unset during gcStart().

Michael Knyszek . resolved

we tend to not put parens after functions in comments. I would say drop this here and elsewhere, but it's a very minor nit.

Georgian-Vlad Saioc

Done

Line 388, Patchset 28: // The GC is running in goroutine leak detection mode; it is set during gcStart()

Michael Knyszek . resolved

super minor nit: other comments in this code base tend to say something like "Indicates that" for this kind of signaling state.

Georgian-Vlad Saioc

Done

Line 389, Patchset 28: // and unset during gcMarkTermination(). Is protected by STW.

Michael Knyszek . resolved

super minor nit: to match the voice of other comments, maybe just "Protected by STW." and put that on its own line so it's not buried in the text.

Georgian-Vlad Saioc

Done

Line 572, Patchset 28://go:linkname runtime_goroutineLeakGC runtime/pprof.runtime_goroutineLeakGC

Michael Knyszek . resolved

we probably want to explicitly blocklist this linkname, since it unfortunately becomes visible to everyone. see go.dev/cl/681497 for how to do that (it should be trivial.)

Georgian-Vlad Saioc

Done

Line 585, Patchset 28: // actually performs goroutine leak detection.

Michael Knyszek . resolved

might be worth a comment as to why this is necessary, and that we only expect one call to be made in the vast majority of cases.

Georgian-Vlad Saioc

Done

Line 813, Patchset 28: if work.goroutineLeakFinder.pending.Load() {

Michael Knyszek . resolved

a quick signpost comment here would be good.

Georgian-Vlad Saioc

Done

Line 816, Patchset 28: gcUntrackSyncObjects()

Michael Knyszek . resolved

a signpost comment explaining the purpose of this line here would be helpful.

Georgian-Vlad Saioc

Done

Line 1132, Patchset 14: return isMarkedOrNotInHeap(gp.waiting.elem.get())

Michael Knyszek . resolved

I think we may need to explicitly queue these with the GC if we discover them to be dead.

here's my thinking. imagine the following scenario.
1. we discover leaked goroutines. we do not mark their elements.
2. the mark phase completes, and we begin the sweep phase.
3. during the sweep phase, these objects are freed.
4. these objects are reallocated (same address).
(4) is problematic for a few reasons.
future analysis finds that the object *is* marked. the goroutine is no longer "leaked".
future allocations allocate *another* sync primitive to the same address. I worry about what could happen if the primitive occupies the same spot in the sema trees, which is indexed by address.

Georgian-Vlad Saioc

I don't think queueing with the GC is necessary. On principle, the goroutine leak GC should mark all the memory the regular GC does. After detecting goroutine leaks, the GC transitions into the state s.t. leaks no longer need to be reported, and the mark phase is resumed to mark everything that is only reachable through leaked gs.

Michael Knyszek

gcRestoreSyncObjects is only executed *after* the mark phase completes. if it was called and we continued another GC round after findGoLeaks returned true, then I'd agree, but as it stands, those pointers will be missing to the GC throughout leak detection, AFAICT.
what am I missing?

Georgian-Vlad Saioc

I added a call to `shade` for the `elem` or `c` fields (depending on the wait reason) right after a goroutine is transitioned into the leaked state. Is this the right way to do it?

Line 1201, Patchset 28: // Report goroutine leaks and mark them unreachable, and resume marking

Michael Knyszek . resolved

assertWorldStopped()?
I think this function and its callees rely on the world being stopped at this point.

Georgian-Vlad Saioc

Done

File src/runtime/mprof.go

Line 1336, Patchset 28: // Using gleakcount while the world is stopped should give us a consistent view


	// of the number of leaked goroutines.

Michael Knyszek . resolved

not true since the world is not stopped here anymore. :)

Georgian-Vlad Saioc

Done

Line 1348, Patchset 28: // Prepare for all other goroutines to enter the profile. Every goroutine struct in the allgs list


	// has its goroutineProfiled field cleared. Any goroutine created from this point on (while
	// goroutineProfile.active is set) will start with its goroutineProfiled
	// field set to goroutineProfileSatisfied.
	goroutineProfile.active = true
	goroutineProfile.records = p
	goroutineProfile.labels = labels

Michael Knyszek . resolved

this still seems unnecessary to me.
why not just call doRecordGoroutineProfile instead of tryRecordGoroutineProfile below? is there some reason that doesn't work?
then we don't need to modify goroutineProfile at all, nor iterate over all the goroutines and set goroutineProfileAbsent.

Georgian-Vlad Saioc

Done

File src/runtime/proc.go

Line 5565, Patchset 28: lock(&allglock)

Michael Knyszek . resolved

I think this is overkill.

I'm not super thrilled either about being non-preemptible for an O(goroutine) operation, which can potentially hold up a GC for a long time.
here's an idea: what if you just kept a global atomic count? at the end of every goroutine leak detection cycle, simply tally up the number, and store it in the global. then, this function is trivial, just a single atomic load. no need to worry about anything else.

Georgian-Vlad Saioc

It doesn't even need to be atomic. Every write would be STW-protected, while all other accesses are reads.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Sep 2, 2025, 9:05:32 PMSep 2

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek voted and added 10 comments

Votes added by Michael Knyszek

Code-Review

+2

10 comments

Patchset-level comments

File-level comment, Patchset 28:

Georgian-Vlad Saioc . resolved

Hi again @mkny...@google.com and @thepud...@gmail.com. I tried to address almost all comments except for one by @mkny...@google.com, that being moving the `"goroutineleak"` pprof flag behind goexperiment.
Looking at how the code has developed, I noticed that there are now only 4 places protected by the experimental flag, anyway, all of them in `src/runtime/sema.go`:
1. https://go-review.googlesource.com/c/go/+/688335/29/src/runtime/sema.go#300
2. https://go-review.googlesource.com/c/go/+/688335/29/src/runtime/sema.go#481
3. https://go-review.googlesource.com/c/go/+/688335/29/src/runtime/sema.go#605
3. https://go-review.googlesource.com/c/go/+/688335/29/src/runtime/sema.go#628
none of which have any particularly large impact. All the other changes are gated behind booleans added to the `work` GC controller, which are at a type level, anyway.
What do think about taking out the experiment entirely, and simply temporarily hiding the goroutine leak profile flag from the documentation and profile listing (with a note to add later)?
This way, people won't be alerted that a new profile exists, unless they dig into the code, and we can also use it within Uber at a large scale during the next release with minimal friction for our deployment pipelines.
Looking forward to your response.
BTW, @mkny...@google.com, I left a comment where we use `shade` to add the concurrency primitives causing leaks to the GC work queue. Let me know if there is a better way to do this.

Michael Knyszek

What do think about taking out the experiment entirely, and simply temporarily hiding the goroutine leak profile flag from the documentation and profile listing (with a note to add later)?

I support removing the experiment, especially before the Go release. you're right that it's not really protecting from much, the flags to enable the analysis inside the GC already prevent the code from getting exercised if you don't use it explicitly, so there's an obvious workaround if there turns out to be a bug or something.

This way, people won't be alerted that a new profile exists, unless they dig into the code, and we can also use it within Uber at a large scale during the next release with minimal friction for our deployment pipelines.

I'm not a huge fan of having the change land with a new public API that's just obscured since it goes against our typical procedures. in what ways does doing this make things easier on your side? I assume you mean Uber's internal release process and not the next Go release (since I would like to have this landed and done for Go 1.26, ideally). it definitely would be great to get some large-scale testing, it's just that typically if we're not 100% sure about an API yet, or it doesn't have an accepted proposal, we put the API behind a GOEXPERIMENT (see: arenas and synctest).

I think to get this change landed ASAP and avoid issues with merge conflicts, it would be ideal to keep the API changes behind a GOEXPERIMENT so we can land it under normal procedures. it's only a 1 line change to enable the GOEXPERIMENT by default for Uber, but I'm also not sure how annoying it is to do any patching of the toolchain for you.

File-level comment, Patchset 29 (Latest):

Michael Knyszek . resolved

looks great! thanks Vlad. just a few trivial comments. +2, though I'd still like us to resolve the API-behind-a-GOEXPERIMENT question before landing, so I'll leave the Hold+1.

File src/internal/goexperiment/flags.go

Line 125, Patchset 28: GoroutineLeakFinderGC bool

Michael Knyszek . unresolved

I like that this is descriptive, but I think we should pick a shorter name for this, haha. it should be something descriptive but memorable (so that users understand what they're enabling/disabling and that we don't have to keep looking it up constantly).
to that end, I don't think the "GC" part is doing too much here. I'm not sure who should care about the fact that it's part of the GC.
"finder" is also a little bit outside of the names we use for other diagnostic tools. it would be nice to align it with something else to make it easier to remember.
here are some other bikeshed colors:
`gosan` as in `goroutine sanitizer` which aligns with the sanitizer names.
`goleakdetector`, which aligns with "race detector."
`goleakprofiler`, which aligns with our profilers.
I don't feel too strongly about any of this, so if one of these or something else sounds good to you, then that's fine with me. I'm personally partial to `goleakprofiler` since we're targeting the `runtime/pprof` package to begin with.

Georgian-Vlad Saioc

I am partial to `goleakprofiler`, too. You're right that the users' main focus will be how they interact with the tool, not the internals.

Michael Knyszek

if we remove the GOEXPERIMENT, that's one fewer bikeshed to paint! :D

File src/runtime/crash_test.go

Line 191, Patchset 28: // TODO(61405): Remove this once it's enabled by default.

Michael Knyszek . unresolved

nit: go.dev/issue/61405.

Georgian-Vlad Saioc

How about removing this comment completely? The range-over-int experiment at "go.dev/issue/61405" should now have been enabled by default (I think this TODO is a holdover from before a rebase on main).

Michael Knyszek

SGTM

File src/runtime/mgc.go

Line 161, Patchset 29 (Latest):var gleakcount int32

Michael Knyszek . unresolved

why not just put this in work.goleakProfiler? it's a little odd to have it be a totally naked global when IMO it seems to be pretty clearly associated with those other fields. you're linkname-exporting a wrapper function anyway, so it doesn't really matter where it is.

Line 161, Patchset 29 (Latest):var gleakcount int32

Michael Knyszek . unresolved

any reason why an int32? what if someone leaks 2^31 goroutines!? the minimum amount of wasted memory is a measly 4 TiB! :P

(in all seriousness, seeing an int32 counter always makes me ask questions, whereas an int64 would just be trivially and obviously never a problem. I would say just promote it.)

Line 1132, Patchset 14: return isMarkedOrNotInHeap(gp.waiting.elem.get())

Michael Knyszek . resolved

I think we may need to explicitly queue these with the GC if we discover them to be dead.
here's my thinking. imagine the following scenario.
1. we discover leaked goroutines. we do not mark their elements.
2. the mark phase completes, and we begin the sweep phase.
3. during the sweep phase, these objects are freed.
4. these objects are reallocated (same address).
(4) is problematic for a few reasons.
future analysis finds that the object *is* marked. the goroutine is no longer "leaked".
future allocations allocate *another* sync primitive to the same address. I worry about what could happen if the primitive occupies the same spot in the sema trees, which is indexed by address.

Georgian-Vlad Saioc

I don't think queueing with the GC is necessary. On principle, the goroutine leak GC should mark all the memory the regular GC does. After detecting goroutine leaks, the GC transitions into the state s.t. leaks no longer need to be reported, and the mark phase is resumed to mark everything that is only reachable through leaked gs.

Michael Knyszek

gcRestoreSyncObjects is only executed *after* the mark phase completes. if it was called and we continued another GC round after findGoLeaks returned true, then I'd agree, but as it stands, those pointers will be missing to the GC throughout leak detection, AFAICT.
what am I missing?

Georgian-Vlad Saioc

I added a call to `shade` for the `elem` or `c` fields (depending on the wait reason) right after a goroutine is transitioned into the leaked state. Is this the right way to do it?

Michael Knyszek

that looks right to me! `shade` is certainly the simplest way.

Line 1285, Patchset 29 (Latest):

		// from the goroutine's stack, but this is not guaranteed for future
		// versions of Go.

Michael Knyszek . unresolved

I think you can just say "let's play it safe." I realize this concern is mostly theoretical, and I think it's OK to be honest about that.

File src/runtime/proc.go

Line 5556, Patchset 29 (Latest):

// gleakcount returns the number of leaked goroutines currently reported by
// the runtime.
//
//go:linkname runtime_gleakcount runtime/pprof.runtime_gleakcount
func runtime_gleakcount() int32 {
	return gleakcount

Michael Knyszek . resolved

FYI, you can actually linkname a global variable directly, which is originally why I thought you called it just `var gleakcount int32`.

Line 5565, Patchset 28: lock(&allglock)

Michael Knyszek . resolved

I think this is overkill.
I'm not super thrilled either about being non-preemptible for an O(goroutine) operation, which can potentially hold up a GC for a long time.
here's an idea: what if you just kept a global atomic count? at the end of every goroutine leak detection cycle, simply tally up the number, and store it in the global. then, this function is trivial, just a single atomic load. no need to worry about anything else.

Georgian-Vlad Saioc

It doesn't even need to be atomic. Every write would be STW-protected, while all other accesses are reads.

Michael Knyszek

🎉

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall
Michael Pratt
t hepudds

Submit Requirements:

Code-Review

No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

satisfied_requirement

unsatisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Sep 5, 2025, 9:27:49 AMSep 5

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #30 to this change.

Following approvals got outdated and were removed:

Code-Review: +2 by Michael Knyszek

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall

Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review

No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Emmanuel Odeke (Gerrit)

unread,

Sep 5, 2025, 9:35:16 AMSep 5

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Michael Knyszek, Go LUCI, t hepudds, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Emmanuel Odeke voted and added 1 comment

Votes added by Emmanuel Odeke

Commit-Queue	+1
Run-TryBot	+1

1 comment

Commit Message

Line 9, Patchset 30 (Latest):runtime: detect goroutine leaks via the DetectDeadlocks API

Emmanuel Odeke . unresolved

Please remove this duplicated line as it is already in the subject.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Georgian-Vlad Saioc
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review

Legacy-TryBots-Pass

No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Sep 5, 2025, 10:29:02 AMSep 5

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Emmanuel Odeke, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Emmanuel Odeke

Georgian-Vlad Saioc
Keith Randall
Michael Knyszek
Michael Pratt

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Sep 8, 2025, 7:04:23 AMSep 8

to Gerrit Bot, goph...@pubsubhelper.golang.org, t hepudds, Go LUCI, Emmanuel Odeke, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Keith Randall, Michael Knyszek and Michael Pratt

Georgian-Vlad Saioc added 7 comments

Patchset-level comments

File-level comment, Patchset 26:

Georgian-Vlad Saioc . resolved

Hi @thepud...@gmail.com, thanks for submitting to CQ again. For some reason it seems like the Cockroach584 test keeps failing on Windows AMD64, but I can't tell why. I thought that increasing the number of yields in the main goroutine will allow the leaky goroutine to get to that state, but it seems that did not help. I don't get any flakiness locally (MacOS-ARM). Given the consistent failures, do you think there is something specific to this OS/arch combo that causes the flakiness?

Georgian-Vlad Saioc

Closing this for now, since we seem to hit green consistently.

File-level comment, Patchset 29:

Georgian-Vlad Saioc . resolved

Hi again @mkny...@google.com,

A few highlights regarding the experiment:

1. Removing it from the semaRoot.queue/dequeue helped me realize that the implementation can be simplified closer to the original implementation! Removing the bitmasks means we no longer need to distinguish between `sync` library and non-`sync` library semas. One more branch gone on a semi-hot path 🎉.

2. I did what you asked regarding the listing of `goroutineleak` as a profile in `net/pprof`, by only adding it if the `GOEXPERIMENT` flag is set. Note that it's only for the listed profile descriptions, i.e., you can still collect this profile if you know how.
This way we get the best of both worlds: people can opt in via flag, but with the right know-how, you can still use it. It's still somewhat stealthy, but the key point for us in Uber is being able to access it without requesting deployment changes from service owners. I'm sure you can picture it how they might (understandably) bristle at the mere mention of "experiment". This approach will seem much less disruptive, as they'll just perceive it as a feature of the Go runtime.

Cheers!

P. S. Just realized I pushed the changes but forgot to hit Reply on the draft here 😐 sorry for the delay.

File src/net/http/pprof/pprof.go

Line 368, Patchset 28:

	"goroutineleak": "Stack traces of all leaked goroutines. Use debug=2 as a query parameter to export in the same format as an unrecovered panic.",

Michael Knyszek . resolved

I don't know how much trouble it would be for you, but it would be good to hide this behind the GOEXPERIMENT as well, just since the proposal isn't accepted yet.

Georgian-Vlad Saioc

Done

File src/runtime/crash_test.go

Line 191, Patchset 28: // TODO(61405): Remove this once it's enabled by default.

Michael Knyszek . resolved

nit: go.dev/issue/61405.

Georgian-Vlad Saioc

How about removing this comment completely? The range-over-int experiment at "go.dev/issue/61405" should now have been enabled by default (I think this TODO is a holdover from before a rebase on main).

Michael Knyszek

SGTM

Georgian-Vlad Saioc

Done

File src/runtime/mgc.go

Line 161, Patchset 29:var gleakcount int32

Michael Knyszek . unresolved

any reason why an int32? what if someone leaks 2^31 goroutines!? the minimum amount of wasted memory is a measly 4 TiB! :P
(in all seriousness, seeing an int32 counter always makes me ask questions, whereas an int64 would just be trivially and obviously never a problem. I would say just promote it.)

Georgian-Vlad Saioc

It was meant to have parity with the `func gcount() int32` in `src/runtime/proc.go`, which is used for similar purposes in `src/runtime/mprof.go`, but I'm all for switching to simply `int` (fewer type coercions throughout the code base).

Line 161, Patchset 29:var gleakcount int32

Michael Knyszek . resolved

why not just put this in work.goleakProfiler? it's a little odd to have it be a totally naked global when IMO it seems to be pretty clearly associated with those other fields. you're linkname-exporting a wrapper function anyway, so it doesn't really matter where it is.

Georgian-Vlad Saioc

Done

Line 1285, Patchset 29:

		// from the goroutine's stack, but this is not guaranteed for future
		// versions of Go.

Michael Knyszek . resolved

I think you can just say "let's play it safe." I realize this concern is mostly theoretical, and I think it's OK to be honest about that.

Georgian-Vlad Saioc

Done

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Emmanuel Odeke

Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
Legacy-TryBots-Pass
No-Holds
No-Unresolved-Comments
Review-Enforcement

TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Sep 8, 2025, 7:17:30 AMSep 8

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #31 to this change.

Following approvals got outdated and were removed:

Legacy-TryBots-Pass: Run-TryBot+1 by Emmanuel Odeke

TryBots-Pass: LUCI-TryBot-Result+1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Emmanuel Odeke
Keith Randall
Michael Knyszek
Michael Pratt

t hepudds

Submit Requirements:

Code-Review

No-Holds
No-Unresolved-Comments
Review-Enforcement

TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Sep 8, 2025, 7:20:57 AMSep 8

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, Emmanuel Odeke, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Emmanuel Odeke
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Georgian-Vlad Saioc (Gerrit)

unread,

Sep 8, 2025, 8:38:42 AMSep 8

to Gerrit Bot, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Emmanuel Odeke, Michael Knyszek, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Keith Randall, Michael Knyszek and Michael Pratt

Georgian-Vlad Saioc added 1 comment

Commit Message

Line 9, Patchset 30:runtime: detect goroutine leaks via the DetectDeadlocks API

Emmanuel Odeke . resolved

Please remove this duplicated line as it is already in the subject.

Georgian-Vlad Saioc

Done

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Emmanuel Odeke
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement

TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Sep 8, 2025, 8:43:31 AMSep 8

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #32 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Emmanuel Odeke
Keith Randall
Michael Knyszek
Michael Pratt

t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Sep 10, 2025, 3:24:48 AMSep 10

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #33 to this change.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Emmanuel Odeke
Keith Randall
Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Sep 10, 2025, 2:11:32 PMSep 10

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Emmanuel Odeke, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 2 comments

Patchset-level comments

File-level comment, Patchset 33 (Latest):

Michael Knyszek . unresolved

I just realized that the testdata directory files all need license headers. please add them. thanks!

File src/internal/goexperiment/flags.go

Line 125, Patchset 28: GoroutineLeakFinderGC bool

Michael Knyszek . unresolved

I like that this is descriptive, but I think we should pick a shorter name for this, haha. it should be something descriptive but memorable (so that users understand what they're enabling/disabling and that we don't have to keep looking it up constantly).
to that end, I don't think the "GC" part is doing too much here. I'm not sure who should care about the fact that it's part of the GC.
"finder" is also a little bit outside of the names we use for other diagnostic tools. it would be nice to align it with something else to make it easier to remember.
here are some other bikeshed colors:
`gosan` as in `goroutine sanitizer` which aligns with the sanitizer names.
`goleakdetector`, which aligns with "race detector."
`goleakprofiler`, which aligns with our profilers.
I don't feel too strongly about any of this, so if one of these or something else sounds good to you, then that's fine with me. I'm personally partial to `goleakprofiler` since we're targeting the `runtime/pprof` package to begin with.

Georgian-Vlad Saioc

I am partial to `goleakprofiler`, too. You're right that the users' main focus will be how they interact with the tool, not the internals.

Michael Knyszek

if we remove the GOEXPERIMENT, that's one fewer bikeshed to paint! :D

Michael Knyszek

in the mini-proposal to get this landed, austin@ suggested removing the "r" so just "goroutineleakprofile"

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Emmanuel Odeke
Keith Randall

Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Sep 10, 2025, 2:17:31 PMSep 10

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Emmanuel Odeke, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Georgian-Vlad Saioc, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 2 comments

Patchset-level comments

File-level comment, Patchset 29:

Georgian-Vlad Saioc . resolved

Hi again @mkny...@google.com,
A few highlights regarding the experiment:
1. Removing it from the semaRoot.queue/dequeue helped me realize that the implementation can be simplified closer to the original implementation! Removing the bitmasks means we no longer need to distinguish between `sync` library and non-`sync` library semas. One more branch gone on a semi-hot path 🎉.
2. I did what you asked regarding the listing of `goroutineleak` as a profile in `net/pprof`, by only adding it if the `GOEXPERIMENT` flag is set. Note that it's only for the listed profile descriptions, i.e., you can still collect this profile if you know how.
This way we get the best of both worlds: people can opt in via flag, but with the right know-how, you can still use it. It's still somewhat stealthy, but the key point for us in Uber is being able to access it without requesting deployment changes from service owners. I'm sure you can picture it how they might (understandably) bristle at the mere mention of "experiment". This approach will seem much less disruptive, as they'll just perceive it as a feature of the Go runtime.
Cheers!
P. S. Just realized I pushed the changes but forgot to hit Reply on the draft here 😐 sorry for the delay.

Michael Knyszek

nice!

re: accessing even without the GOEXPERIMENT set, how are you deploying this in Uber currently? are you running a tip Go toolchain, or are you backporting the patch to Go 1.24/Go 1.25? if it's the latter, why not just turn on the experiment by default in the patch?

I am sympathetic to teams getting uncomfortable at hearing "experiment," but the stealthiness undermines the proposal process a bit. it's true that it's not like this is landing in a public release, so it's probably not a big deal. but we will have to put the functionality behind the GOEXPERIMENT if for whatever reason we can't push the proposal through the process before the release. (to be clear, I don't foresee that being a problem, just setting expectations.)

File src/runtime/mgc.go

Line 161, Patchset 29:var gleakcount int32

Michael Knyszek . unresolved

any reason why an int32? what if someone leaks 2^31 goroutines!? the minimum amount of wasted memory is a measly 4 TiB! :P
(in all seriousness, seeing an int32 counter always makes me ask questions, whereas an int64 would just be trivially and obviously never a problem. I would say just promote it.)

Georgian-Vlad Saioc

It was meant to have parity with the `func gcount() int32` in `src/runtime/proc.go`, which is used for similar purposes in `src/runtime/mprof.go`, but I'm all for switching to simply `int` (fewer type coercions throughout the code base).

Michael Knyszek

I think the `int32` for `gcount` is a historical artifact. using simply `int` SGTM.

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Emmanuel Odeke
Georgian-Vlad Saioc

unsatisfied_requirement

satisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Sep 10, 2025, 2:17:36 PMSep 10

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Emmanuel Odeke, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Georgian-Vlad Saioc, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek voted Commit-Queue+1

Commit-Queue

+1

unsatisfied_requirement

satisfied_requirement

open

diffy

Michael Knyszek (Gerrit)

unread,

Sep 10, 2025, 2:18:20 PMSep 10

to Gerrit Bot, Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, Go LUCI, t hepudds, Emmanuel Odeke, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Georgian-Vlad Saioc, Keith Randall, Michael Pratt and t hepudds

Michael Knyszek added 1 comment

Patchset-level comments

File-level comment, Patchset 30:

Gopher Robot . resolved

TryBots beginning. Status page: https://farmer.golang.org/try?commit=213b31ac

Michael Knyszek

Acknowledged

unsatisfied_requirement

satisfied_requirement

open

diffy

Gerrit Bot (Gerrit)

unread,

Sep 12, 2025, 6:42:29 AMSep 12

to Georgian-Vlad Saioc, goph...@pubsubhelper.golang.org, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek, Michael Pratt and t hepudds

Gerrit Bot uploaded new patchset

Gerrit Bot uploaded patch set #34 to this change.

Following approvals got outdated and were removed:

TryBots-Pass: LUCI-TryBot-Result-1 by Go LUCI

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Emmanuel Odeke
Georgian-Vlad Saioc
Keith Randall

Michael Knyszek
Michael Pratt
t hepudds

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement

TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

t hepudds (Gerrit)

unread,

Sep 12, 2025, 7:26:02 AMSep 12

to Georgian-Vlad Saioc, Gerrit Bot, goph...@pubsubhelper.golang.org, Michael Knyszek, Go LUCI, Emmanuel Odeke, Keith Randall, Michael Pratt, Milind Chabbi, Austin Clements, Gopher Robot, golang-co...@googlegroups.com

Attention needed from Austin Clements, Emmanuel Odeke, Georgian-Vlad Saioc, Keith Randall, Michael Knyszek and Michael Pratt

t hepudds voted Commit-Queue+1

Commit-Queue

+1

Open in Gerrit

Related details

Attention is currently required from:

Austin Clements
Emmanuel Odeke
Georgian-Vlad Saioc
Keith Randall
Michael Knyszek
Michael Pratt

Submit Requirements:

Code-Review
No-Holds
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

[go] Proposal #74609: goroutine leak detection by using the garbage collector

Gerrit Bot (Gerrit)

Gerrit Bot has uploaded the change for review

Commit message

Change diff

Change information

Related details

Gopher Robot (Gerrit)

Gopher Robot added 1 comment

Related details

Gopher Robot (Gerrit)

Message from Gopher Robot

Related details

Gerrit Bot (Gerrit)

Gerrit Bot uploaded new patchset

Related details

Georgian-Vlad Saioc (Gerrit)

Georgian-Vlad Saioc added 2 comments

Related details

Gerrit Bot (Gerrit)

Gerrit Bot uploaded new patchset

Related details

Gerrit Bot (Gerrit)

Gerrit Bot uploaded new patchset

Related details

Keith Randall (Gerrit)

Keith Randall voted and added 1 comment

Votes added by Keith Randall

1 comment

Related details

Gerrit Bot (Gerrit)

Gerrit Bot uploaded new patchset

Related details

t hepudds (Gerrit)

t hepudds added 2 comments

Related details

Michael Knyszek (Gerrit)

Michael Knyszek added 20 comments

Related details

Michael Knyszek (Gerrit)

Michael Knyszek added 1 comment

Gerrit Bot (Gerrit)

Gerrit Bot uploaded new patchset

Related details

Georgian-Vlad Saioc (Gerrit)

Georgian-Vlad Saioc added 18 comments

Related details

Georgian-Vlad Saioc (Gerrit)

Georgian-Vlad Saioc added 1 comment

Michael Knyszek (Gerrit)

Michael Knyszek added 1 comment

Related details

Michael Knyszek (Gerrit)

Michael Knyszek added 1 comment

Gerrit Bot (Gerrit)

Gerrit Bot uploaded new patchset

Related details

Gerrit Bot (Gerrit)

Gerrit Bot uploaded new patchset

Related details

Gerrit Bot (Gerrit)

Gerrit Bot uploaded new patchset

Related details

Gerrit Bot (Gerrit)

Gerrit Bot uploaded new patchset

Related details

Georgian-Vlad Saioc (Gerrit)

Georgian-Vlad Saioc added 5 comments

Related details

Georgian-Vlad Saioc (Gerrit)

Georgian-Vlad Saioc added 1 comment

Gerrit Bot (Gerrit)

Gerrit Bot uploaded new patchset

Related details

Gerrit Bot (Gerrit)

Gerrit Bot uploaded new patchset

Related details

t hepudds (Gerrit)

t hepudds voted Commit-Queue+1

Related details