
Commit 05780f1

jaewonlee-fb authored and facebook-github-bot committed
[CUDACachingAlloc/GPUInference] Implement garbage collection without GPU sync (#74261)
Summary:
Pull Request resolved: #74261

### Goal

Implement a cheap way to reclaim GPU memory (garbage collection) without incurring a GPU sync.

### Why do we need this?

Currently, there are only two ways to reclaim GPU memory blocks already assigned to a particular stream:

- `release_available_cached_blocks(params)`: frees blocks exceeding `CachingAllocatorConfig::max_split_size()` until the request can be satisfied. Issue: if `max_split_size` is unset (the default), this function is a no-op. Even when it is set, reclamation is quite conservative (e.g., it never frees blocks under `max_split_size`).
- `release_cached_blocks()`: waits for all in-flight events and then reclaims blocks. Issue: waiting for all events is very expensive, as it will likely stall all GPU operations. Many GPU applications that do not properly handle the resulting GPU throttling may suffer or crash.

### Proposed idea

- If the garbage collection threshold is set, try to reclaim some memory blocks *without* synchronization. This should be safe, as `release_available_cached_blocks` essentially does the same thing (just less aggressively).
- GC is triggered only when a `malloc` request cannot be served from the block pool; there is no need to free blocks while the pool is functioning fine.
- Prioritize reclaiming blocks that have not been reused for a long time. Reclamation stops once the used memory capacity drops below the threshold.
- This code path is entirely optional; by default it is never invoked.

Test Plan:
- Unit tests
- Manually checked that GPU memory usage stays at the level indicated by the garbage collection threshold; if not, the caching allocator at least keeps trying to free blocks.

Reviewed By: jianyuh

Differential Revision: D34482514

fbshipit-source-id: d5eae62ac60b94b0bca851f9d233a092d086e3c2
1 parent d85eb0f commit 05780f1
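
For a rough illustration of the age-based policy described above, here is a minimal, self-contained C++ sketch. `CachedBlock`, the `std::list` pool, and the `garbage_collect` helper are simplified stand-ins invented for this example (not the allocator's real `Block`/`BlockPool` types), erasing from the list replaces the actual `cudaFree` and statistics bookkeeping, and the loop mirrors the average-age idea rather than the exact committed code in the diff below.

```cpp
#include <cstddef>
#include <iostream>
#include <list>

// Simplified stand-in for a cached, currently idle block.
struct CachedBlock {
  std::size_t size; // bytes held by this cached block
  int gc_count;     // "age": how many failed pool lookups since it was last reused
  bool is_split;    // split blocks are never reclaimed by this policy
};

// Free blocks whose age is at or above the running average age, repeating
// until `target_bytes` have been reclaimed or nothing more can be freed.
std::size_t garbage_collect(std::list<CachedBlock>& pool, std::size_t target_bytes) {
  double total_age = 0.0;
  int freeable = 0;
  for (const auto& b : pool) {
    if (!b.is_split) {
      total_age += b.gc_count;
      ++freeable;
    }
  }
  std::size_t reclaimed = 0;
  bool freed_one = true;
  while (reclaimed < target_bytes && freed_one && freeable > 0) {
    const double age_threshold = total_age / freeable;
    freed_one = false;
    for (auto it = pool.begin(); it != pool.end();) {
      if (!it->is_split && it->gc_count >= age_threshold) {
        reclaimed += it->size;
        total_age -= it->gc_count;
        --freeable;
        freed_one = true;
        it = pool.erase(it); // stand-in for cudaFree + bookkeeping
      } else {
        ++it;
      }
    }
  }
  return reclaimed;
}

int main() {
  // Three idle blocks with different ages; ask for ~3 MiB back.
  std::list<CachedBlock> pool = {
      {1u << 20, 10, false}, {2u << 20, 1, false}, {4u << 20, 7, false}};
  std::cout << "reclaimed " << garbage_collect(pool, 3u << 20) << " bytes, "
            << pool.size() << " block(s) left\n";
}
```

Freeing everything at or above the average age avoids sorting blocks by age while still preferring the blocks that have gone unused the longest.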

File tree

3 files changed: +124 -11 lines changed

c10/cuda/CUDACachingAllocator.cpp
c10/cuda/CUDACachingAllocator.h
docs/source/notes/cuda.rst


c10/cuda/CUDACachingAllocator.cpp

Lines changed: 115 additions & 11 deletions
@@ -178,6 +178,8 @@ struct Block {
   Block* prev; // prev block if split from a larger allocation
   Block* next; // next block if split from a larger allocation
   int event_count; // number of outstanding CUDA events
+  int gc_count; // counter for prioritizing older / less useful blocks for
+                // garbage collection

   Block(
       int device,
@@ -194,7 +196,8 @@ struct Block {
         allocated(0),
         prev(nullptr),
         next(nullptr),
-        event_count(0) {}
+        event_count(0),
+        gc_count(0) {}

   // constructor for search key
   Block(int device, cudaStream_t stream, size_t size)
@@ -207,7 +210,8 @@
         allocated(0),
         prev(nullptr),
         next(nullptr),
-        event_count(0) {}
+        event_count(0),
+        gc_count(0) {}

   bool is_split() const {
     return (prev != nullptr) || (next != nullptr);
@@ -331,6 +335,9 @@ class CachingAllocatorConfig {
   static size_t max_split_size() {
     return instance().m_max_split_size;
   }
+  static double garbage_collection_threshold() {
+    return instance().m_garbage_collection_threshold;
+  }

   // This is used to round-up allocation size to nearest power of 2 divisions.
   // More description below in function roundup_power2_next_division
@@ -352,9 +359,11 @@

   CachingAllocatorConfig()
       : m_max_split_size(std::numeric_limits<size_t>::max()),
-        m_roundup_power2_divisions(0) {}
+        m_roundup_power2_divisions(0),
+        m_garbage_collection_threshold(0) {}
   size_t m_max_split_size;
   size_t m_roundup_power2_divisions;
+  double m_garbage_collection_threshold;

   void parseArgs() {
     const char* val = getenv("PYTORCH_CUDA_ALLOC_CONF");
@@ -391,6 +400,25 @@
                 "For roundups, the divisons has to be power of 2 ",
                 "");
             m_roundup_power2_divisions = val2;
+          } else if (kv[0].compare("garbage_collection_threshold") == 0) {
+            /*
+             * Perform garbage collection of GPU memory blocks to avoid
+             * triggering expensive sync-and-reclaim-all operation. Upon setting
+             * the threshold (e.g., 0.8), the allocator will start reclaiming
+             * blocks if GPU memory capacity usage exceeds the threshold (i.e.,
+             * 80% of total memory).
+             * Values 0.0 and 1.0 are not allowed as they are less meaningful.
+             */
+            double val2 = stod(kv[1]);
+            TORCH_CHECK(
+                val2 > 0,
+                "garbage_collect_threshold too small, set it 0.0~1.0",
+                "");
+            TORCH_CHECK(
+                val2 < 1.0,
+                "garbage_collect_threshold too big, set it 0.0~1.0",
+                "");
+            m_garbage_collection_threshold = val2;
           } else {
             TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", kv[0]);
           }
@@ -487,18 +515,29 @@ class DeviceCachingAllocator {
     params.stat_types[static_cast<size_t>(StatType::AGGREGATE)] = true;
     params.stat_types[static_cast<size_t>(get_stat_type_for_pool(pool))] = true;

+    // First, try to get a block from the existing pool.
     bool block_found =
         // Search pool
         get_free_block(params)
         // Trigger callbacks and retry search
-        || (trigger_free_memory_callbacks(params) && get_free_block(params))
-        // Attempt allocate
-        || alloc_block(params, false)
-        // Free enough available cached blocks to satisfy alloc and retry alloc.
-        ||
-        (release_available_cached_blocks(params) && alloc_block(params, false))
-        // Free all non-split cached blocks and retry alloc.
-        || (release_cached_blocks() && alloc_block(params, true));
+        || (trigger_free_memory_callbacks(params) && get_free_block(params));
+
+    // Can't reuse an existing block; try to get a new one.
+    if (!block_found) {
+      // Do garbage collection if the flag is set.
+      if (C10_UNLIKELY(
+              CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) {
+        garbage_collect_cached_blocks();
+      }
+      // Attempt allocate
+      block_found = alloc_block(params, false)
+          // Free enough available cached blocks to satisfy alloc and retry
+          // alloc.
+          || (release_available_cached_blocks(params) &&
+              alloc_block(params, false))
+          // Free all non-split cached blocks and retry alloc.
+          || (release_cached_blocks() && alloc_block(params, true));
+    }

     if (!block_found) {
       // For any error code other than cudaErrorMemoryAllocation,
@@ -1087,6 +1126,14 @@ class DeviceCachingAllocator {

   bool get_free_block(AllocParams& p) {
     BlockPool& pool = *p.pool;
+
+    if (C10_UNLIKELY(
+            CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) {
+      // Track block reuse interval only when garbage collection is enabled.
+      for (auto& b : pool.blocks) {
+        ++b->gc_count;
+      }
+    }
     auto it = pool.blocks.lower_bound(&p.search_key);
     if (it == pool.blocks.end() || (*it)->stream != p.stream())
       return false;
@@ -1099,6 +1146,7 @@
         ((*it)->size >= p.size() + kLargeBuffer))
       return false;
     p.block = *it;
+    (*it)->gc_count = 0; // Denote this block has been used
     pool.blocks.erase(it);
     return true;
   }
@@ -1112,6 +1160,62 @@
     return freed_memory;
   }

+  void garbage_collect_cached_blocks() {
+    // Free unused cached blocks to reclaim GPU memory.
+    // Unlike release_cached_blocks(), this does not enforce synchronization and
+    // therefore should be of less overheads.
+
+    size_t gc_threshold = static_cast<size_t>(
+        CachingAllocatorConfig::garbage_collection_threshold() *
+        allowed_memory_maximum);
+    // No need to trigger GC yet
+    if (total_allocated_memory <= gc_threshold) {
+      return;
+    }
+    const auto target_size = total_allocated_memory - gc_threshold;
+    size_t gc_reclaimed = 0;
+
+    // Calculate the total age of the free-able blocks. We'll use it later to
+    // get "avg age" threshold.
+    double total_age = 0.0;
+    int freeable_block_count = 0;
+    for (auto& b : large_blocks.blocks) {
+      if (!b->is_split()) {
+        total_age += b->gc_count;
+        ++freeable_block_count;
+      }
+    }
+    // No free-able blocks?
+    if (freeable_block_count == 0) {
+      return;
+    }
+
+    // Repeat GC until we reach reclaim > target size.
+    bool block_freed = true;
+    while (gc_reclaimed < target_size && block_freed == true &&
+           freeable_block_count > 0) {
+      // Free blocks exceeding this age threshold first.
+      double age_threshold = total_age / freeable_block_count;
+      // Stop iteration if we can no longer free a block.
+      block_freed = false;
+
+      // Free blocks of > avg age. Don't stop upon reaching the target_size,
+      // we don't want this GC to be triggered frequently.
+      auto it = large_blocks.blocks.begin();
+      while (it != large_blocks.blocks.end()) {
+        Block* block = *it;
+        ++it;
+        if (!block->is_split() && block->gc_count >= age_threshold) {
+          block_freed = true;
+          gc_reclaimed += block->size;
+          total_age -= block->gc_count; // Decrement the age
+          freeable_block_count--; // One less block that can be freed
+          release_block(block);
+        }
+      }
+    }
+  }
+
   bool alloc_block(AllocParams& p, bool isRetry) {
     // Defensively checks for preexisting CUDA error state.
     C10_CUDA_CHECK(cudaGetLastError());

c10/cuda/CUDACachingAllocator.h

Lines changed: 1 addition & 0 deletions
@@ -102,6 +102,7 @@ struct DeviceStats {
 // cudaMalloc)..
 struct BlockInfo {
   int64_t size = 0;
+  int32_t gc_counter = 0;
   bool allocated = false;
   bool active = false;
 };

docs/source/notes/cuda.rst

Lines changed: 8 additions & 0 deletions
@@ -376,6 +376,14 @@ Available options:
   the size 1200 lies between 1024 and 2048 and if we do 4 divisions between
   them, the values are 1024, 1280, 1536, and 1792. So, allocation size of 1200
   will be rounded to 1280 as the nearest ceiling of power-2 division.
+* ``garbage_collection_threshold`` helps actively reclaiming unused GPU memory to
+  avoid triggering expensive sync-and-reclaim-all operation (release_cached_blocks),
+  which can be unfavorable to latency-critical GPU applications (e.g., servers).
+  Upon setting this threshold (e.g., 0.8), the allocator will start reclaiming
+  GPU memory blocks if the GPU memory capacity usage exceeds the threshold (i.e.,
+  80% of the total memory allocated to the GPU application). The algorithm prefers
+  to free old & unused blocks first to avoid freeing blocks that are actively being
+  reused. The threshold value should be between greater than 0.0 and less than 1.0.

 .. _cufft-plan-cache:
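
As a usage sketch (not part of this commit): the option is read from the `PYTORCH_CUDA_ALLOC_CONF` environment variable parsed in the allocator config above, so it only needs to be set before the allocator first parses its configuration. The libtorch calls, the tensor size, and the assumption that setting the variable before the first CUDA allocation is early enough are illustrative; setting the variable in the launching shell works the same way.

```cpp
#include <cstdlib>
#include <torch/torch.h>

int main() {
  // Assumption for this sketch: the variable must be in place before the
  // caching allocator parses PYTORCH_CUDA_ALLOC_CONF (i.e., before the first
  // CUDA allocation). Equivalent shell form:
  //   PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.8 ./app
  setenv("PYTORCH_CUDA_ALLOC_CONF", "garbage_collection_threshold:0.8",
         /*overwrite=*/1);

  // Allocations from here on go through the caching allocator, which will
  // start reclaiming idle cached blocks once its total allocated memory
  // exceeds roughly 80% of the allowed maximum.
  auto t = torch::empty({1024, 1024}, torch::device(torch::kCUDA));
  return 0;
}
```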
