
Commit 05780f1

jaewonlee-fb authored and facebook-github-bot committed
[CUDACachingAlloc/GPUInference] Implement garbage collection without GPU sync (#74261)
Summary:
Pull Request resolved: #74261

### Goal

Implement a cheap way to reclaim GPU memory (garbage collection) without incurring a GPU sync.

### Why do we need this?

Currently, there are only two ways to reclaim GPU memory blocks already assigned to a particular stream:

- `release_available_cached_blocks(params)`: frees blocks exceeding `CachingAllocatorConfig::max_split_size()` until the request can be satisfied. Issue: if `max_split_size` is unset (the default), this function is a no-op. Even when it is set, reclamation is quite conservative (e.g., it never frees blocks under `max_split_size`).
- `release_cached_blocks()`: waits for all in-flight events and then reclaims blocks. Issue: waiting for all events is very expensive, as it will likely stall all GPU operations. Many GPU applications that do not properly handle the resulting GPU throttling may suffer or crash.

### Proposed idea

- If the garbage collection threshold is set, try to reclaim some memory blocks *without* synchronization. This should be safe, as `release_available_cached_blocks` essentially does the same thing (just less aggressively).
- GC is triggered only when a `malloc` request cannot be served from the block pool; there is no need to free blocks while the pool is functioning fine.
- Prioritize reclaiming blocks that have not been reused for a long time. Reclamation stops once the used memory capacity drops below the threshold.
- This code path is entirely optional; by default it is never invoked.

Test Plan:
- Unit tests
- Manually checked that GPU memory usage stays at the level indicated by the garbage collection threshold; if not, the caching allocator at least keeps trying to free blocks.

Reviewed By: jianyuh

Differential Revision: D34482514

fbshipit-source-id: d5eae62ac60b94b0bca851f9d233a092d086e3c2
1 parent d85eb0f commit 05780f1
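
For a rough illustration of the age-based policy described above, here is a minimal, self-contained C++ sketch. `CachedBlock`, the `std::list` pool, and the `garbage_collect` helper are simplified stand-ins invented for this example (not the allocator's real `Block`/`BlockPool` types), erasing from the list replaces the actual `cudaFree` and statistics bookkeeping, and the loop mirrors the average-age idea rather than the exact committed code in the diff below.

```cpp
#include <cstddef>
#include <iostream>
#include <list>

// Simplified stand-in for a cached, currently idle block.
struct CachedBlock {
  std::size_t size; // bytes held by this cached block
  int gc_count;     // "age": how many failed pool lookups since it was last reused
  bool is_split;    // split blocks are never reclaimed by this policy
};

// Free blocks whose age is at or above the running average age, repeating
// until `target_bytes` have been reclaimed or nothing more can be freed.
std::size_t garbage_collect(std::list<CachedBlock>& pool, std::size_t target_bytes) {
  double total_age = 0.0;
  int freeable = 0;
  for (const auto& b : pool) {
    if (!b.is_split) {
      total_age += b.gc_count;
      ++freeable;
    }
  }
  std::size_t reclaimed = 0;
  bool freed_one = true;
  while (reclaimed < target_bytes && freed_one && freeable > 0) {
    const double age_threshold = total_age / freeable;
    freed_one = false;
    for (auto it = pool.begin(); it != pool.end();) {
      if (!it->is_split && it->gc_count >= age_threshold) {
        reclaimed += it->size;
        total_age -= it->gc_count;
        --freeable;
        freed_one = true;
        it = pool.erase(it); // stand-in for cudaFree + bookkeeping
      } else {
        ++it;
      }
    }
  }
  return reclaimed;
}

int main() {
  // Three idle blocks with different ages; ask for ~3 MiB back.
  std::list<CachedBlock> pool = {
      {1u << 20, 10, false}, {2u << 20, 1, false}, {4u << 20, 7, false}};
  std::cout << "reclaimed " << garbage_collect(pool, 3u << 20) << " bytes, "
            << pool.size() << " block(s) left\n";
}
```

Freeing everything at or above the average age avoids sorting blocks by age while still preferring the blocks that have gone unused the longest.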

File tree

3 files changed: +124 -11 lines changed

c10/cuda/CUDACachingAllocator.cpp
c10/cuda/CUDACachingAllocator.h
docs/source/notes/cuda.rst


c10/cuda/CUDACachingAllocator.cpp

Lines changed: 115 additions & 11 deletions
@@ -178,6 +178,8 @@ struct Block {
   Block* prev; // prev block if split from a larger allocation
   Block* next; // next block if split from a larger allocation
   int event_count; // number of outstanding CUDA events
+  int gc_count; // counter for prioritizing older / less useful blocks for
+                // garbage collection

   Block(
       int device,
@@ -194,7 +196,8 @@ struct Block {
         allocated(0),
         prev(nullptr),
         next(nullptr),
-        event_count(0) {}
+        event_count(0),
+        gc_count(0) {}

   // constructor for search key
   Block(int device, cudaStream_t stream, size_t size)
@@ -207,7 +210,8 @@
         allocated(0),
         prev(nullptr),
         next(nullptr),
-        event_count(0) {}
+        event_count(0),
+        gc_count(0) {}

   bool is_split() const {
     return (prev != nullptr) || (next != nullptr);
@@ -331,6 +335,9 @@ class CachingAllocatorConfig {
   static size_t max_split_size() {
     return instance().m_max_split_size;
   }
+  static double garbage_collection_threshold() {
+    return instance().m_garbage_collection_threshold;
+  }

   // This is used to round-up allocation size to nearest power of 2 divisions.
   // More description below in function roundup_power2_next_division
@@ -352,9 +359,11 @@

   CachingAllocatorConfig()
       : m_max_split_size(std::numeric_limits<size_t>::max()),
-        m_roundup_power2_divisions(0) {}
+        m_roundup_power2_divisions(0),
+        m_garbage_collection_threshold(0) {}
   size_t m_max_split_size;
   size_t m_roundup_power2_divisions;
+  double m_garbage_collection_threshold;

   void parseArgs() {
     const char* val = getenv("PYTORCH_CUDA_ALLOC_CONF");
@@ -391,6 +400,25 @@
                 "For roundups, the divisons has to be power of 2 ",
                 "");
             m_roundup_power2_divisions = val2;
+          } else if (kv[0].compare("garbage_collection_threshold") == 0) {
+            /*
+             * Perform garbage collection of GPU memory blocks to avoid
+             * triggering expensive sync-and-reclaim-all operation. Upon setting
+             * the threshold (e.g., 0.8), the allocator will start reclaiming
+             * blocks if GPU memory capacity usage exceeds the threshold (i.e.,
+             * 80% of total memory).
+             * Values 0.0 and 1.0 are not allowed as they are less meaningful.
+             */
+            double val2 = stod(kv[1]);
+            TORCH_CHECK(
+                val2 > 0,
+                "garbage_collect_threshold too small, set it 0.0~1.0",
+                "");
+            TORCH_CHECK(
+                val2 < 1.0,
+                "garbage_collect_threshold too big, set it 0.0~1.0",
+                "");
+            m_garbage_collection_threshold = val2;
           } else {
             TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", kv[0]);
           }
@@ -487,18 +515,29 @@ class DeviceCachingAllocator {
     params.stat_types[static_cast<size_t>(StatType::AGGREGATE)] = true;
     params.stat_types[static_cast<size_t>(get_stat_type_for_pool(pool))] = true;

+    // First, try to get a block from the existing pool.
     bool block_found =
         // Search pool
         get_free_block(params)
         // Trigger callbacks and retry search
-        || (trigger_free_memory_callbacks(params) && get_free_block(params))
-        // Attempt allocate
-        || alloc_block(params, false)
-        // Free enough available cached blocks to satisfy alloc and retry alloc.
-        ||
-        (release_available_cached_blocks(params) && alloc_block(params, false))
-        // Free all non-split cached blocks and retry alloc.
-        || (release_cached_blocks() && alloc_block(params, true));
+        || (trigger_free_memory_callbacks(params) && get_free_block(params));
+
+    // Can't reuse an existing block; try to get a new one.
+    if (!block_found) {
+      // Do garbage collection if the flag is set.
+      if (C10_UNLIKELY(
+              CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) {
+        garbage_collect_cached_blocks();
+      }
+      // Attempt allocate
+      block_found = alloc_block(params, false)
+          // Free enough available cached blocks to satisfy alloc and retry
+          // alloc.
+          || (release_available_cached_blocks(params) &&
+              alloc_block(params, false))
+          // Free all non-split cached blocks and retry alloc.
+          || (release_cached_blocks() && alloc_block(params, true));
+    }

     if (!block_found) {
       // For any error code other than cudaErrorMemoryAllocation,
@@ -1087,6 +1126,14 @@ class DeviceCachingAllocator {

   bool get_free_block(AllocParams& p) {
     BlockPool& pool = *p.pool;
+
+    if (C10_UNLIKELY(
+            CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) {
+      // Track block reuse interval only when garbage collection is enabled.
+      for (auto& b : pool.blocks) {
+        ++b->gc_count;
+      }
+    }
     auto it = pool.blocks.lower_bound(&p.search_key);
     if (it == pool.blocks.end() || (*it)->stream != p.stream())
       return false;
@@ -1099,6 +1146,7 @@
         ((*it)->size >= p.size() + kLargeBuffer))
       return false;
     p.block = *it;
+    (*it)->gc_count = 0; // Denote this block has been used
     pool.blocks.erase(it);
     return true;
   }
@@ -1112,6 +1160,62 @@
     return freed_memory;
   }

+  void garbage_collect_cached_blocks() {
+    // Free unused cached blocks to reclaim GPU memory.
+    // Unlike release_cached_blocks(), this does not enforce synchronization and
+    // therefore should be of less overheads.
+
+    size_t gc_threshold = static_cast<size_t>(
+        CachingAllocatorConfig::garbage_collection_threshold() *
+        allowed_memory_maximum);
+    // No need to trigger GC yet
+    if (total_allocated_memory <= gc_threshold) {
+      return;
+    }
+    const auto target_size = total_allocated_memory - gc_threshold;
+    size_t gc_reclaimed = 0;
+
+    // Calculate the total age of the free-able blocks. We'll use it later to
+    // get "avg age" threshold.
+    double total_age = 0.0;
+    int freeable_block_count = 0;
+    for (auto& b : large_blocks.blocks) {
+      if (!b->is_split()) {
+        total_age += b->gc_count;
+        ++freeable_block_count;
+      }
+    }
+    // No free-able blocks?
+    if (freeable_block_count == 0) {
+      return;
+    }
+
+    // Repeat GC until we reach reclaim > target size.
+    bool block_freed = true;
+    while (gc_reclaimed < target_size && block_freed == true &&
+           freeable_block_count > 0) {
+      // Free blocks exceeding this age threshold first.
+      double age_threshold = total_age / freeable_block_count;
+      // Stop iteration if we can no longer free a block.
+      block_freed = false;
+
+      // Free blocks of > avg age. Don't stop upon reaching the target_size,
+      // we don't want this GC to be triggered frequently.
+      auto it = large_blocks.blocks.begin();
+      while (it != large_blocks.blocks.end()) {
+        Block* block = *it;
+        ++it;
+        if (!block->is_split() && block->gc_count >= age_threshold) {
+          block_freed = true;
+          gc_reclaimed += block->size;
+          total_age -= block->gc_count; // Decrement the age
+          freeable_block_count--; // One less block that can be freed
+          release_block(block);
+        }
+      }
+    }
+  }
+
   bool alloc_block(AllocParams& p, bool isRetry) {
     // Defensively checks for preexisting CUDA error state.
     C10_CUDA_CHECK(cudaGetLastError());

c10/cuda/CUDACachingAllocator.h

Lines changed: 1 addition & 0 deletions
@@ -102,6 +102,7 @@ struct DeviceStats {
 // cudaMalloc)..
 struct BlockInfo {
   int64_t size = 0;
+  int32_t gc_counter = 0;
   bool allocated = false;
   bool active = false;
 };

docs/source/notes/cuda.rst

Lines changed: 8 additions & 0 deletions
@@ -376,6 +376,14 @@ Available options:
   the size 1200 lies between 1024 and 2048 and if we do 4 divisions between
   them, the values are 1024, 1280, 1536, and 1792. So, allocation size of 1200
   will be rounded to 1280 as the nearest ceiling of power-2 division.
+* ``garbage_collection_threshold`` helps actively reclaiming unused GPU memory to
+  avoid triggering expensive sync-and-reclaim-all operation (release_cached_blocks),
+  which can be unfavorable to latency-critical GPU applications (e.g., servers).
+  Upon setting this threshold (e.g., 0.8), the allocator will start reclaiming
+  GPU memory blocks if the GPU memory capacity usage exceeds the threshold (i.e.,
+  80% of the total memory allocated to the GPU application). The algorithm prefers
+  to free old & unused blocks first to avoid freeing blocks that are actively being
+  reused. The threshold value should be between greater than 0.0 and less than 1.0.

 .. _cufft-plan-cache:
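
As a usage sketch (not part of this commit): the option is read from the `PYTORCH_CUDA_ALLOC_CONF` environment variable parsed in the allocator config above, so it only needs to be set before the allocator first parses its configuration. The libtorch calls, the tensor size, and the assumption that setting the variable before the first CUDA allocation is early enough are illustrative; setting the variable in the launching shell works the same way.

```cpp
#include <cstdlib>
#include <torch/torch.h>

int main() {
  // Assumption for this sketch: the variable must be in place before the
  // caching allocator parses PYTORCH_CUDA_ALLOC_CONF (i.e., before the first
  // CUDA allocation). Equivalent shell form:
  //   PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.8 ./app
  setenv("PYTORCH_CUDA_ALLOC_CONF", "garbage_collection_threshold:0.8",
         /*overwrite=*/1);

  // Allocations from here on go through the caching allocator, which will
  // start reclaiming idle cached blocks once its total allocated memory
  // exceeds roughly 80% of the allowed maximum.
  auto t = torch::empty({1024, 1024}, torch::device(torch::kCUDA));
  return 0;
}
```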
