From 0e70f5d841d41455be92f6c7d3c4e00467b7bdf1 Mon Sep 17 00:00:00 2001
From: Nathan VanBenschoten <nvanbenschoten@gmail.com>
Date: Mon, 19 Sep 2022 18:34:42 -0400
Subject: [PATCH] runtime: export total GC Assist ns in MemStats and GCStats

At a high level, the runtime garbage collector can impact user goroutine
latency in two ways. The first is that it pauses all goroutines during
its stop-the-world sweep termination and mark termination phases. The
second is that it backpressures memory allocations by instructing user
goroutines to assist with scanning and marking in response to a high
allocation rate.

There is plenty of observability into the first of these sources of
user-visible latency. There is significantly less observability into the
second. As a result, it is often more difficult to diagnose latency
problems due to over-assist (e.g. #14812, #27732, #40225). To this
point, the ways to determine that GC assist was a problem were to use
execution tracing or to use GODEBUG=gctrace=1 tracing, neither of which
is easy to access programmatically in a running system. CPU profiles
also give some insight, but are rarely as instructive as one might
expect because heavy GC assist time is scattered across a profile.
Notice even in https://tip.golang.org/doc/gc-guide, the guidance on
recognizing and remedying performance problems due to GC assist is
sparse.

This commit adds a counter to the MemStats and GCStats structs called
AssistTotalNs, which tracks the cumulative nanoseconds in GC assist
since the program started. This provides a new form of observability
into GC assist delays, and one that can be manipulated programmatically.

There's more work to be done in this area. This feels like a reasonable
first step.
---
 src/runtime/debug/garbage.go      |  8 +++++---
 src/runtime/debug/garbage_test.go |  3 +++
 src/runtime/malloc_test.go        |  4 ++--
 src/runtime/mgc.go                |  1 +
 src/runtime/mstats.go             | 17 +++++++++++++++--
 src/runtime/pprof/pprof.go        |  2 ++
 6 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/src/runtime/debug/garbage.go b/src/runtime/debug/garbage.go
index 0f53928fe8530f..f8117270992b52 100644
--- a/src/runtime/debug/garbage.go
+++ b/src/runtime/debug/garbage.go
@@ -18,6 +18,7 @@ type GCStats struct {
 	Pause          []time.Duration // pause history, most recent first
 	PauseEnd       []time.Time     // pause end times history, most recent first
 	PauseQuantiles []time.Duration
+	AssistTotal    time.Duration // total assist across all collections
 }
 
 // ReadGCStats reads statistics about garbage collection into stats.
@@ -35,8 +36,8 @@ func ReadGCStats(stats *GCStats) {
 	// for end times history and as a temporary buffer for
 	// computing quantiles.
 	const maxPause = len(((*runtime.MemStats)(nil)).PauseNs)
-	if cap(stats.Pause) < 2*maxPause+3 {
-		stats.Pause = make([]time.Duration, 2*maxPause+3)
+	if cap(stats.Pause) < 2*maxPause+4 {
+		stats.Pause = make([]time.Duration, 2*maxPause+4)
 	}
 
 	// readGCStats fills in the pause and end times histories (up to
@@ -46,10 +47,11 @@ func ReadGCStats(stats *GCStats) {
 	// nanoseconds, so the pauses and the total pause time do not need
 	// any conversion.
 	readGCStats(&stats.Pause)
-	n := len(stats.Pause) - 3
+	n := len(stats.Pause) - 4
 	stats.LastGC = time.Unix(0, int64(stats.Pause[n]))
 	stats.NumGC = int64(stats.Pause[n+1])
 	stats.PauseTotal = stats.Pause[n+2]
+	stats.AssistTotal = stats.Pause[n+3]
 	n /= 2 // buffer holds pauses and end times
 	stats.Pause = stats.Pause[:n]
 
diff --git a/src/runtime/debug/garbage_test.go b/src/runtime/debug/garbage_test.go
index 7213bbe641dd6e..058a9a49aff54a 100644
--- a/src/runtime/debug/garbage_test.go
+++ b/src/runtime/debug/garbage_test.go
@@ -36,6 +36,9 @@ func TestReadGCStats(t *testing.T) {
 	if stats.PauseTotal != time.Duration(mstats.PauseTotalNs) {
 		t.Errorf("stats.PauseTotal = %d, but mstats.PauseTotalNs = %d", stats.PauseTotal, mstats.PauseTotalNs)
 	}
+	if stats.AssistTotal != time.Duration(mstats.AssistTotalNs) {
+		t.Errorf("stats.AssistTotal = %d, but mstats.AssistTotalNs = %d", stats.AssistTotal, mstats.AssistTotalNs)
+	}
 	if stats.LastGC.UnixNano() != int64(mstats.LastGC) {
 		t.Errorf("stats.LastGC.UnixNano = %d, but mstats.LastGC = %d", stats.LastGC.UnixNano(), mstats.LastGC)
 	}
diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go
index cc2007604ddb6f..8bf8d099b9b8ec 100644
--- a/src/runtime/malloc_test.go
+++ b/src/runtime/malloc_test.go
@@ -62,7 +62,7 @@ func TestMemStats(t *testing.T) {
 			return fmt.Errorf("want %v", x)
 		}
 	}
-	// Of the uint fields, HeapReleased, HeapIdle can be 0.
+	// Of the uint fields, HeapReleased, HeapIdle, AssistTotalNs can be 0.
 	// PauseTotalNs can be 0 if timer resolution is poor.
 	fields := map[string][]func(any) error{
 		"Alloc": {nz, le(1e10)}, "TotalAlloc": {nz, le(1e11)}, "Sys": {nz, le(1e10)},
@@ -74,7 +74,7 @@ func TestMemStats(t *testing.T) {
 		"MCacheInuse": {nz, le(1e10)}, "MCacheSys": {nz, le(1e10)},
 		"BuckHashSys": {nz, le(1e10)}, "GCSys": {nz, le(1e10)}, "OtherSys": {nz, le(1e10)},
 		"NextGC": {nz, le(1e10)}, "LastGC": {nz},
-		"PauseTotalNs": {le(1e11)}, "PauseNs": nil, "PauseEnd": nil,
+		"PauseTotalNs": {le(1e11)}, "PauseNs": nil, "PauseEnd": nil, "AssistTotalNs": {le(1e10)},
 		"NumGC": {nz, le(1e9)}, "NumForcedGC": {nz, le(1e9)},
 		"GCCPUFraction": {le(0.99)}, "EnableGC": {eq(true)}, "DebugGC": {eq(false)},
 		"BySize": nil,
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 6a9d278187fb79..183a4c68db839a 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -1003,6 +1003,7 @@ func gcMarkTermination() {
 	memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
 	memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
 	memstats.pause_total_ns += uint64(work.pauseNS)
+	memstats.assist_total_ns += uint64(gcController.assistTime)
 
 	sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
 	// We report idle marking time below, but omit it from the
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 3a5273f3619cf6..ba8f49135eb2d2 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -36,6 +36,7 @@ type mstats struct {
 	pause_total_ns  uint64
 	pause_ns        [256]uint64 // circular buffer of recent gc pause lengths
 	pause_end       [256]uint64 // circular buffer of recent gc end times (nanoseconds since 1970)
+	assist_total_ns uint64
 	numgc           uint32
 	numforcedgc     uint32  // number of user-forced GCs
 	gc_cpu_fraction float64 // fraction of CPU time used by GC
@@ -275,6 +276,16 @@ type MemStats struct {
 	// last pause in a cycle.
 	PauseEnd [256]uint64
 
+	// AssistTotalNs is the cumulative nanoseconds in GC assist since
+	// the program started.
+	//
+	// During GC assist, user goroutines yield some of their time to
+	// assist the GC with scanning and marking in response to a high
+	// allocation rate. A large amount of cumulative time spent here
+	// indicates that the application is likely out-pacing the GC
+	// with respect to how fast it is allocating.
+	AssistTotalNs uint64
+
 	// NumGC is the number of completed GC cycles.
 	NumGC uint32
 
@@ -530,6 +541,7 @@ func readmemstats_m(stats *MemStats) {
 	stats.PauseTotalNs = memstats.pause_total_ns
 	stats.PauseNs = memstats.pause_ns
 	stats.PauseEnd = memstats.pause_end
+	stats.AssistTotalNs = memstats.assist_total_ns
 	stats.NumGC = memstats.numgc
 	stats.NumForcedGC = memstats.numforcedgc
 	stats.GCCPUFraction = memstats.gc_cpu_fraction
@@ -556,7 +568,7 @@ func readGCStats(pauses *[]uint64) {
 func readGCStats_m(pauses *[]uint64) {
 	p := *pauses
 	// Calling code in runtime/debug should make the slice large enough.
-	if cap(p) < len(memstats.pause_ns)+3 {
+	if cap(p) < len(memstats.pause_ns)+4 {
 		throw("short slice passed to readGCStats")
 	}
 
@@ -582,8 +594,9 @@ func readGCStats_m(pauses *[]uint64) {
 	p[n+n] = memstats.last_gc_unix
 	p[n+n+1] = uint64(memstats.numgc)
 	p[n+n+2] = memstats.pause_total_ns
+	p[n+n+3] = memstats.assist_total_ns
 	unlock(&mheap_.lock)
-	*pauses = p[:n+n+3]
+	*pauses = p[:n+n+4]
 }
 
 // flushmcache flushes the mcache of allp[i].
diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go
index 3a7191e092e7b9..2b6b53b5899b62 100644
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -636,8 +636,10 @@ func writeHeapInternal(w io.Writer, debug int, defaultSampleType string) error {
 
 	fmt.Fprintf(w, "# NextGC = %d\n", s.NextGC)
 	fmt.Fprintf(w, "# LastGC = %d\n", s.LastGC)
+	fmt.Fprintf(w, "# PauseTotalNs = %d\n", s.PauseTotalNs)
 	fmt.Fprintf(w, "# PauseNs = %d\n", s.PauseNs)
 	fmt.Fprintf(w, "# PauseEnd = %d\n", s.PauseEnd)
+	fmt.Fprintf(w, "# AssistTotalNs = %d\n", s.AssistTotalNs)
 	fmt.Fprintf(w, "# NumGC = %d\n", s.NumGC)
 	fmt.Fprintf(w, "# NumForcedGC = %d\n", s.NumForcedGC)
 	fmt.Fprintf(w, "# GCCPUFraction = %v\n", s.GCCPUFraction)