// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Scavenging free pages.
//
// This file implements scavenging (the release of physical pages backing mapped
// memory) of free and unused pages in the heap as a way to deal with page-level
// fragmentation and reduce the RSS of Go applications.
//
// Scavenging in Go happens on two fronts: there's the background
// (asynchronous) scavenger and the heap-growth (synchronous) scavenger.
//
// The former happens on a goroutine much like the background sweeper which is
// soft-capped at using scavengePercent of the mutator's time, based on
// order-of-magnitude estimates of the costs of scavenging. The background
// scavenger's primary goal is to bring the estimated heap RSS of the
// application down to a goal.
//
// That goal is defined as (retainExtraPercent+100) / 100 * next_gc.
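//
// For example (illustrative numbers): with the default
// retainExtraPercent = 10 and a heap goal of next_gc = 100 MiB, the
// scavenger's RSS goal works out to (10+100)/100 * 100 MiB = 110 MiB.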
//
// The goal is updated after each GC and the scavenger's pacing parameters
// (which live in mheap_) are updated to match. The pacing parameters work much
// like the background sweeping parameters. The parameters define a line whose
// horizontal axis is time and vertical axis is estimated heap RSS, and the
// scavenger attempts to stay below that line at all times.
//
// The synchronous heap-growth scavenging happens whenever the heap grows in
// size, for some definition of heap-growth. The intuition behind this is that
// the application had to grow the heap because existing fragments were
// not sufficiently large to satisfy a page-level memory allocation, so we
// scavenge those fragments eagerly to offset the growth in RSS that results.

package runtime

const (
	// The background scavenger is paced according to these parameters.
	//
	// scavengePercent represents the portion of mutator time we're willing
	// to spend on scavenging in percent.
	//
	// scavengePageLatency is a worst-case estimate (order-of-magnitude) of
	// the time it takes to scavenge one (regular-sized) page of memory.
	// scavengeHugePageLatency is the same but for huge pages.
	//
	// scavengePagePeriod is derived from scavengePercent and scavengePageLatency,
	// and represents the average time between scavenging one page that we're
	// aiming for. scavengeHugePagePeriod is the same but for huge pages.
	// These constants are core to the scavenge pacing algorithm.
	scavengePercent         = 1    // 1%
	scavengePageLatency     = 10e3 // 10µs
	scavengeHugePageLatency = 10e3 // 10µs
	scavengePagePeriod      = scavengePageLatency / (scavengePercent / 100.0)
	scavengeHugePagePeriod  = scavengeHugePageLatency / (scavengePercent / 100.0)
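
	// As a worked example with these defaults: each period is
	// 10e3ns / (1/100) = 1e6ns, i.e. one page per millisecond on average,
	// so the ~10µs of work per page stays at about 1% of mutator time.
	// Assuming a typical 4 KiB physical page and a 2 MiB huge page, that
	// paces scavenging at roughly 4 MiB/s of regular pages or 2 GiB/s of
	// huge pages.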

	// retainExtraPercent represents the amount of memory over the heap goal
	// that the scavenger should keep as a buffer space for the allocator.
	//
	// The purpose of maintaining this overhead is to have a greater pool of
	// unscavenged memory available for allocation (since using scavenged memory
	// incurs an additional cost), to account for heap fragmentation and
	// the ever-changing layout of the heap.
	retainExtraPercent = 10
)

// heapRetained returns an estimate of the current heap RSS.
//
// mheap_.lock must be held or the world must be stopped.
func heapRetained() uint64 {
	return memstats.heap_sys - memstats.heap_released
}

// gcPaceScavenger updates the scavenger's pacing, particularly
// its rate and RSS goal.
//
// The RSS goal is based on the current heap goal with a small overhead
// to accommodate non-determinism in the allocator.
//
// The pacing is based on scavengePagePeriod and scavengeHugePagePeriod;
// see those constants for more information.
//
// mheap_.lock must be held or the world must be stopped.
func gcPaceScavenger() {
	// Compute our scavenging goal and align it to a physical page boundary
	// to make the following calculations more exact.
	retainedGoal := memstats.next_gc
	// Add retainExtraPercent overhead to retainedGoal. This calculation
	// looks strange but the purpose is to arrive at an integer division
	// (e.g. if retainExtraPercent = 12.5, then we get a divisor of 8)
	// that also avoids the overflow from a multiplication.
	retainedGoal += retainedGoal / (1.0 / (retainExtraPercent / 100.0))
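	// Concretely, with the default retainExtraPercent = 10 the divisor is
	// 1.0/0.10 = 10, so the line above computes retainedGoal += retainedGoal/10.
	// Computing retainedGoal*110/100 instead could overflow a uint64 for
	// very large heap goals.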
	retainedGoal = (retainedGoal + uint64(physPageSize) - 1) &^ (uint64(physPageSize) - 1)
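	// The mask above rounds retainedGoal up to a multiple of physPageSize:
	// with a 4 KiB page, (x + 4095) &^ 4095 bumps x past the next boundary
	// and clears the low 12 bits, so e.g. 4097 rounds up to 8192.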

	// Represents where we are now in the heap's contribution to RSS in bytes.
	//
	// Guaranteed to always be a multiple of physPageSize on systems where
	// physPageSize <= pageSize, since we map heap_sys at a rate larger than
	// any physPageSize and release memory in multiples of the physPageSize.
	//
	// However, certain functions recategorize heap_sys as other stats (e.g.
	// stack_sys) and this happens in multiples of pageSize, so on systems
	// where physPageSize > pageSize the calculations below will not be exact.
	// Generally this is OK since we'll be off by at most one regular
	// physical page.
	retainedNow := heapRetained()

	// If we're already below our goal, publish the goal in case it changed,
	// then disable the background scavenger.
	if retainedNow <= retainedGoal {
		mheap_.scavengeRetainedGoal = retainedGoal
		mheap_.scavengeBytesPerNS = 0
		return
	}

	// Now we start to compute the total amount of work necessary and the total
	// amount of time we're willing to give the scavenger to complete this work.
	// This will involve calculating how much of the work consists of huge pages
	// and how much consists of regular pages, since the former lets us scavenge
	// more memory in the same amount of time.
	totalWork := retainedNow - retainedGoal

	// On systems without huge page support, all work is regular work.
	regularWork := totalWork
	hugeTime := uint64(0)

	// On systems where we have huge pages, we want to do as much of the
	// scavenging work as possible on huge pages, because the costs are the
	// same per page, but we can give back more memory in a shorter
	// period of time.
	if physHugePageSize != 0 {
		// Start by computing the amount of free memory we have in huge pages
		// in total. Trivially, this is all the huge page work we need to do.
		hugeWork := uint64(mheap_.free.unscavHugePages * physHugePageSize)

		// ...but it could turn out that there's more huge work to do than
		// total work, so cap it at total work. This might happen for very large
		// heaps where the additional factor of retainExtraPercent can make it so
		// that there are free chunks of memory larger than a huge page that we
		// don't want to scavenge.
		if hugeWork >= totalWork {
			hugePages := totalWork / uint64(physHugePageSize)
			hugeWork = hugePages * uint64(physHugePageSize)
		}
		// Everything that's not huge work is regular work. At this point we
		// know huge work, so we can calculate how much time it will take
		// based on scavengeHugePagePeriod.
		regularWork = totalWork - hugeWork
		hugeTime = hugeWork / uint64(physHugePageSize) * scavengeHugePagePeriod
	}
	// Finally, we can compute how much time it'll take to do the regular work
	// and the total time to do all the work.
	regularTime := regularWork / uint64(physPageSize) * scavengePagePeriod
	totalTime := hugeTime + regularTime
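
	// As an illustrative example (assuming 4 KiB regular pages and 2 MiB
	// huge pages): if totalWork is 64 MiB, of which 16 MiB sits in free
	// huge pages, then hugeTime is 8 * scavengeHugePagePeriod = 8ms and
	// the remaining 48 MiB of regular work costs 12288 * scavengePagePeriod
	// ≈ 12.3s, so totalTime (and hence the pacing rate) is dominated by
	// the regular-page work.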

	now := nanotime()

	lock(&scavenge.lock)

	// Update all the pacing parameters in mheap with scavenge.lock held,
	// so that scavenge.gen is kept in sync with the updated values.
	mheap_.scavengeRetainedGoal = retainedGoal
	mheap_.scavengeRetainedBasis = retainedNow
	mheap_.scavengeTimeBasis = now
	mheap_.scavengeBytesPerNS = float64(totalWork) / float64(totalTime)
	scavenge.gen++ // increase scavenge generation

	// Wake up background scavenger if needed, since the pacing was just updated.
	wakeScavengerLocked()

	unlock(&scavenge.lock)
}

// State of the background scavenger.
var scavenge struct {
	lock   mutex
	g      *g
	parked bool
	timer  *timer
	gen    uint32 // read with either lock or mheap_.lock, write with both
}

// wakeScavengerLocked unparks the scavenger if necessary. It must be called
// after any pacing update.
//
// scavenge.lock must be held.
func wakeScavengerLocked() {
	if scavenge.parked {
		// Try to stop the timer but we don't really care if we succeed.
		// It's possible that either a timer was never started, or that
		// we're racing with it.
		// In the case that we're racing with it, there's a low chance that
		// we experience a spurious wake-up of the scavenger, but that's
		// totally safe.
		stopTimer(scavenge.timer)

		// Unpark the goroutine and tell it that there may have been a pacing
		// change.
		scavenge.parked = false
		ready(scavenge.g, 0, true)
	}
}

// scavengeSleep attempts to put the scavenger to sleep for ns.
// It also checks gen against scavenge.gen before going to sleep,
// and aborts the sleep if they differ (meaning a pacing update occurred).
//
// Note that this function should only be called by the scavenger.
//
// The scavenger may be woken up earlier by a pacing change, and it may not go
// to sleep at all if there's a pending pacing change.
//
// Returns false if awoken early (i.e. true means a complete sleep).
func scavengeSleep(gen uint32, ns int64) bool {
	lock(&scavenge.lock)

	// If there was an update, just abort the sleep.
	if scavenge.gen != gen {
		unlock(&scavenge.lock)
		return false
	}

	// Set the timer.
	now := nanotime()
	scavenge.timer.when = now + ns
	startTimer(scavenge.timer)

	// Park the goroutine. It's fine that we don't publish the
	// fact that the timer was set; even if the timer fires and
	// runs wakeScavengerLocked before we park, it'll block on
	// scavenge.lock.
	scavenge.parked = true
	goparkunlock(&scavenge.lock, waitReasonSleep, traceEvGoSleep, 2)

	// Return true if we completed the full sleep.
	return (nanotime() - now) >= ns
}

// Background scavenger.
//
// The background scavenger maintains the RSS of the application below
// the line described by the proportional scavenging statistics in
// the mheap struct.
func bgscavenge(c chan int) {
	scavenge.g = getg()

	lock(&scavenge.lock)
	scavenge.parked = true

	scavenge.timer = new(timer)
	scavenge.timer.f = func(_ interface{}, _ uintptr) {
		lock(&scavenge.lock)
		wakeScavengerLocked()
		unlock(&scavenge.lock)
	}

	c <- 1
	goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)

	// Parameters for sleeping.
	//
	// If we end up doing more work than we need, we should avoid spinning
	// until we have more work to do: instead, we know exactly how much time
	// will pass until more work needs to be done, so we sleep.
	//
	// We should avoid sleeping for less than minSleepNS; below that
	// threshold, yielding with Gosched() works out better once timer and
	// scheduling overheads are accounted for.
	//
	// There's no reason to set a maximum on sleep time because we'll always
	// get woken up earlier if there's any kind of update that could change
	// the scavenger's pacing.
	//
	// retryDelayNS tracks how much to sleep next time we fail to do any
	// useful work.
	const minSleepNS = int64(100 * 1000) // 100 µs

	retryDelayNS := minSleepNS
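
	// retryDelayNS doubles each time a full retry sleep passes without any
	// memory being released (100µs, 200µs, 400µs, ...) and resets to
	// minSleepNS as soon as a scavenge releases something; see the
	// released == 0 case below.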

	for {
		released := uintptr(0)
		park := false
		ttnext := int64(0)
		gen := uint32(0)

		// Run on the system stack since we grab the heap lock,
		// and a stack growth with the heap lock means a deadlock.
		systemstack(func() {
			lock(&mheap_.lock)

			gen = scavenge.gen

			// If background scavenging is disabled or if there's no work to do, just park.
			retained := heapRetained()
			if mheap_.scavengeBytesPerNS == 0 || retained <= mheap_.scavengeRetainedGoal {
				unlock(&mheap_.lock)
				park = true
				return
			}

			// Calculate how big we want the retained heap to be
			// at this point in time.
			//
			// The formula is that of a line, y = b - mx, where:
			//   y = want, the target retained heap size
			//   m = scavengeBytesPerNS (> 0)
			//   x = the time elapsed since scavengeTimeBasis
			//   b = scavengeRetainedBasis
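			//
			// For example (illustrative numbers): with a basis of 128 MiB,
			// a rate of 1 MiB per millisecond, and 10ms elapsed since the
			// basis was set, want works out to 128 MiB - 10 MiB = 118 MiB.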
			rate := mheap_.scavengeBytesPerNS
			tdist := nanotime() - mheap_.scavengeTimeBasis
			rdist := uint64(rate * float64(tdist))
			want := mheap_.scavengeRetainedBasis - rdist

			// If we're above the line, scavenge to get below the
			// line.
			if retained > want {
				released = mheap_.scavengeLocked(uintptr(retained - want))
			}
			unlock(&mheap_.lock)

			// If we over-scavenged a bit, calculate how much time it'll
			// take at the current rate for us to make that up. We definitely
			// won't have any work to do until at least that amount of time
			// passes.
			if released > uintptr(retained-want) {
				extra := released - uintptr(retained-want)
				ttnext = int64(float64(extra) / rate)
			}
		})

		if park {
			lock(&scavenge.lock)
			scavenge.parked = true
			goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
			continue
		}

		if debug.gctrace > 0 {
			if released > 0 {
				print("scvg: ", released>>20, " MB released\n")
			}
			print("scvg: inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n")
		}

		if released == 0 {
			// If we were unable to release anything, it may be because there's
			// no free memory available to scavenge. Go to sleep and try again.
			if scavengeSleep(gen, retryDelayNS) {
				// If we successfully slept through the delay, back off exponentially.
				retryDelayNS *= 2
			}
			continue
		}
		retryDelayNS = minSleepNS

		if ttnext > 0 && ttnext > minSleepNS {
			// If there's an appreciable amount of time until the next scavenging
			// goal, just sleep. We'll get woken up if anything changes and this
			// way we avoid spinning.
			scavengeSleep(gen, ttnext)
			continue
		}

		// Give something else a chance to run; no locks are held.
		Gosched()
	}
}