 #include <ATen/mps/MPSEvent.h>
 #include <ATen/mps/MPSStream.h>
 
+#include <c10/util/flat_hash_map.h>
+#include <mach/vm_page_size.h>
 #include <cstdio>
 #include <mutex>
 #include <set>
 #include <unordered_set>
-#include <mach/vm_page_size.h>
-#include <c10/util/flat_hash_map.h>
 
 // this implementation is based on CUDACachingAllocator.
 // It utilizes Metal Heaps to improve the performance with buffer allocation.
 // Do not include this header. Use MPSAllocatorInterface.h instead.
 // TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
 namespace at::mps::HeapAllocator {
 
-static const size_t kMaxSmallAlloc = MB(1);    // largest "small" allocation is 1 MiB
-static const size_t kMinLargeAlloc = MB(10);   // allocations between 1 and 10 MiB may use kLargeHeap
-static const size_t kRoundLarge    = MB(2);    // round up large allocations to 2 MiB
-static const size_t kSmallHeap     = MB(8);    // "small" allocations are packed in 8 MiB heaps
-static const size_t kLargeHeap     = MB(32);   // "large" allocations may be packed in 32 MiB heaps
-static const size_t kXLargeHeapD   = MB(128);  // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
-static const size_t kXLargeHeapU   = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
+static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB
+static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap
+static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB
+static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps
+static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps
+static const size_t kXLargeHeapD =
+    MB(128); // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
+static const size_t kXLargeHeapU =
+    MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
 static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation
 
 // buffer pools could be customized with a combination of usage flags
 enum UsageFlags : uint32_t {
   PRIVATE = 0,
-  SMALL   = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
-  SHARED  = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
+  SMALL = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
+  SHARED = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
   MANAGED = (1 << 2), // managed storage mode
-  HAZARD  = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
-  SCALAR  = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
+  HAZARD = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
+  SCALAR = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
 };
 // debug verbosity flags
 enum DebugVerbosity : uint32_t {
-  SILENT      = 0,
-  PROFILING   = (1 << 0), // print generic profiling data for total system memory usage
+  SILENT = 0,
+  PROFILING = (1 << 0), // print generic profiling data for total system memory usage
   ALLOCATIONS = (1 << 1), // print buffer allocations
-  RECYCLES    = (1 << 2), // print buffer recycling
-  RELEASES    = (1 << 3), // print buffer releases
-  LARGE_ONLY  = (1 << 4), // only log large buffer pool transactions
+  RECYCLES = (1 << 2), // print buffer recycling
+  RELEASES = (1 << 3), // print buffer releases
+  LARGE_ONLY = (1 << 4), // only log large buffer pool transactions
 };
 
 struct HeapBlock;
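
Aside on the size classes above: requests up to kMaxSmallAlloc are packed into kSmallHeap-sized heaps, requests below kMinLargeAlloc into kLargeHeap-sized ones, and anything larger is rounded up to a multiple of kRoundLarge (see createHeapBlock further down). A rough, self-contained sketch of that policy; the helper name pick_heap_size and the MB definition here are illustrative, not part of this header:

#include <cstddef>

// mirrors the (assumed) MB() helper used by the constants above
constexpr size_t MB(size_t x) {
  return x * 1048576UL;
}

// hypothetical helper: maps a requested allocation size to a heap size,
// approximating the branching in HeapBlock::createHeapBlock
size_t pick_heap_size(size_t requested) {
  if (requested <= MB(1)) // kMaxSmallAlloc
    return MB(8); // kSmallHeap: many small buffers share one heap
  if (requested < MB(10)) // kMinLargeAlloc
    return MB(32); // kLargeHeap
  // larger requests get a dedicated heap, rounded up to kRoundLarge
  return ((requested + MB(2) - 1) / MB(2)) * MB(2);
}
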
@@ -67,10 +69,8 @@ struct BufferBlock {
   // Metal events used to sync GPU/CPU operations on the shared-storage buffers
   MPSEventPtr event;
 
-  BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr,
-              HeapBlock* Heap = nullptr) :
-              buffer(Buffer), size(Size), requested_size(RequestedSize),
-              heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) { }
+  BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr, HeapBlock* Heap = nullptr)
+      : buffer(Buffer), size(Size), requested_size(RequestedSize), heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) {}
 
   static bool Comparator(const BufferBlock* a, const BufferBlock* b) {
     return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer;
@@ -79,15 +79,19 @@ struct BufferBlock {
     assert(((Alignment - 1) & Alignment) == 0);
     return ((Size + Alignment - 1) & ~(Alignment - 1));
   }
-  uint32_t retainCount() const { return [buffer retainCount]; }
+  uint32_t retainCount() const {
+    return [buffer retainCount];
+  }
 };
 typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*);
 
 struct BufferPool;
 struct AllocParams {
-  AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) :
-    search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) { }
-  size_t size() const { return search_key.size; }
+  AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool)
+      : search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) {}
+  size_t size() const {
+    return search_key.size;
+  }
 
   BufferBlock search_key;
   BufferPool* pool;
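
The alignUp context lines in this hunk use the standard power-of-two rounding trick: the assert rejects non-power-of-two alignments (((A - 1) & A) is zero only when A is zero or a power of two), and the add-then-mask rounds up. A standalone restatement with worked values:

#include <cassert>
#include <cstddef>
#include <cstdio>

// same bit trick as BufferBlock::alignUp; valid only for power-of-two alignments
static size_t align_up(size_t size, size_t alignment) {
  assert(((alignment - 1) & alignment) == 0);
  return (size + alignment - 1) & ~(alignment - 1);
}

int main() {
  std::printf("%zu\n", align_up(1000, 256));  // 1024: next multiple of 256
  std::printf("%zu\n", align_up(4096, 4096)); // 4096: already aligned, unchanged
}
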
@@ -102,7 +106,9 @@ struct AllocParams {
 
 struct HeapBlock {
   id<MTLHeap> heap;
-  struct { size_t total, available; } size;
+  struct {
+    size_t total, available;
+  } size;
   BufferPool* pool;
   unsigned int n_buffers = 0;
   id_t heap_id;
@@ -111,9 +117,12 @@ struct HeapBlock {
   // counter to assign unique ids to heap blocks
   static uint64_t heap_counter;
 
-  HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool *Pool = nullptr) :
-            heap(Heap), size({.total = Size, .available = Size}), pool(Pool),
-            heap_id(Heap ? ++heap_counter : 0), is_split(true) { }
+  HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool* Pool = nullptr)
+      : heap(Heap),
+        size({.total = Size, .available = Size}),
+        pool(Pool),
+        heap_id(Heap ? ++heap_counter : 0),
+        is_split(true) {}
 
   static MTLResourceOptions getOptions(uint32_t usage) {
     // TODO: check the caching performance of write-combined mode
@@ -126,16 +135,17 @@ struct HeapBlock {
     else
       options |= MTLResourceStorageModePrivate;
 
-    options |= (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;
+    options |=
+        (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;
 
     return options;
   }
 
   static HeapBlock* createHeapBlock(AllocParams& params, id<MTLDevice> device, uint32_t usage) {
-    HeapBlock *heapBlock = nullptr;
+    HeapBlock* heapBlock = nullptr;
     bool is_split = true;
     const size_t size = params.size();
-    MTLHeapDescriptor *d = [MTLHeapDescriptor new];
+    MTLHeapDescriptor* d = [MTLHeapDescriptor new];
     if (d) {
       const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD;
       if (size <= kMaxSmallAlloc) {
@@ -152,10 +162,11 @@ struct HeapBlock {
       d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
       // this automatically handles Metal buffer access synchronizations at the
       // cost of slightly lower performance.
-      d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
+      d.hazardTrackingMode =
+          (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
       d.resourceOptions = getOptions(usage);
       d.type = MTLHeapTypeAutomatic;
-      id<MTLHeap> heap = [device newHeapWithDescriptor: d];
+      id<MTLHeap> heap = [device newHeapWithDescriptor:d];
       if (heap) {
         [heap setPurgeableState:MTLPurgeableStateNonVolatile];
         const size_t heap_size = heapAvailableSize(heap);
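
For readers unfamiliar with the Metal calls in this hunk: createHeapBlock fills an MTLHeapDescriptor, asks the device for a heap, and the pools later sub-allocate buffers out of it. A minimal standalone sketch of that flow; the size and modes here are illustrative, where the real code derives them from AllocParams and UsageFlags:

// compile on macOS with: clang++ -fobjc-arc -framework Metal -framework Foundation sketch.mm
#import <Metal/Metal.h>

int main() {
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
  MTLHeapDescriptor* d = [MTLHeapDescriptor new];
  d.size = 8 * 1048576; // a kSmallHeap-sized heap (8 MiB)
  d.storageMode = MTLStorageModeShared; // as for a SHARED pool on unified memory
  d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
  d.hazardTrackingMode = MTLHazardTrackingModeTracked; // the HAZARD usage flag
  d.type = MTLHeapTypeAutomatic;
  id<MTLHeap> heap = [device newHeapWithDescriptor:d];
  // sub-allocate one buffer from the heap, as the buffer pools do per BufferBlock
  id<MTLBuffer> buf = [heap newBufferWithLength:4096 options:MTLResourceStorageModeShared];
  return (heap != nil && buf != nil) ? 0 : 1;
}
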
@@ -169,8 +180,8 @@ struct HeapBlock {
     return heapBlock;
   }
   static bool Comparator(const HeapBlock* a, const HeapBlock* b) {
-    return (a->size.available != b->size.available) ? a->size.available < b->size.available :
-           (uintptr_t)a->heap < (uintptr_t)b->heap;
+    return (a->size.available != b->size.available) ? a->size.available < b->size.available
+                                                    : (uintptr_t)a->heap < (uintptr_t)b->heap;
   }
   static NSUInteger heapAvailableSize(id<MTLHeap> heap, size_t Alignment = vm_page_size) {
     return [heap maxAvailableSizeWithAlignment:Alignment];
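
Both Comparator functions order blocks by size first and break ties with the raw pointer, which lets the pools keep blocks in an ordered std::set and answer best-fit queries with lower_bound. A minimal self-contained sketch of that pattern in plain C++, with a simplified Block standing in for BufferBlock:

#include <cstdint>
#include <set>

// simplified stand-in: ordered by size; pointer tie-break lets distinct
// blocks of equal size coexist in the set
struct Block {
  size_t size;
};

static bool comparator(const Block* a, const Block* b) {
  return (a->size != b->size) ? a->size < b->size : (uintptr_t)a < (uintptr_t)b;
}

int main() {
  std::set<Block*, bool (*)(const Block*, const Block*)> pool(comparator);
  Block b1{4096}, b2{16384}, b3{16384};
  pool.insert(&b1);
  pool.insert(&b2);
  pool.insert(&b3);

  // best-fit lookup: smallest available block that can hold the request
  Block key{10000};
  auto it = pool.lower_bound(&key); // first block with size >= 10000
  return (it != pool.end() && (*it)->size == 16384) ? 0 : 1;
}
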
@@ -205,8 +216,12 @@ struct HeapBlock {
     size.available = 0;
     return retainCount;
   }
-  uint32_t retainCount() const { return [heap retainCount]; }
-  void updateAvailableSize() { size.available = heapAvailableSize(heap); }
+  uint32_t retainCount() const {
+    return [heap retainCount];
+  }
+  void updateAvailableSize() {
+    size.available = heapAvailableSize(heap);
+  }
 };
 typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*);
 
@@ -219,9 +234,8 @@ struct BufferPool {
     SCALAR,
   };
 
-  BufferPool(const id<MTLDevice> Device, uint32_t Usage) :
-             device(Device), usage(Usage),
-             heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) { }
+  BufferPool(const id<MTLDevice> Device, uint32_t Usage)
+      : device(Device), usage(Usage), heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) {}
 
   const id<MTLDevice> device;
   // usage flags to customize the pool for various purposes (see UsageFlags enum)
@@ -248,12 +262,12 @@ struct BufferPool {
 };
 
 class MPSHeapAllocatorImpl {
-public:
-  explicit MPSHeapAllocatorImpl() :
-    m_device(at::mps::MPSDevice::getInstance()->device()),
-    m_max_buffer_size([m_device maxBufferLength]),
-    m_stream(getDefaultMPSStream()),
-    m_event_pool(getMPSEventPool()) {
+ public:
+  explicit MPSHeapAllocatorImpl()
+      : m_device(at::mps::MPSDevice::getInstance()->device()),
+        m_max_buffer_size([m_device maxBufferLength]),
+        m_stream(getDefaultMPSStream()),
+        m_event_pool(getMPSEventPool()) {
     init_allocator();
   }
   ~MPSHeapAllocatorImpl() {
@@ -298,34 +312,50 @@ class MPSHeapAllocatorImpl {
   // (see m_high_watermark_ratio for description)
   void setHighWatermarkRatio(double ratio);
   // (see m_low_watermark_limit for description)
-  size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
+  size_t getLowWatermarkLimit() const {
+    return m_low_watermark_limit;
+  }
   // (see m_max_total_allowed_size for description)
-  size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
+  size_t getHighWatermarkLimit() const {
+    return m_max_total_allowed_size;
+  }
   // (see m_total_allocated_memory for description)
-  size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; }
+  size_t getTotalAllocatedMemory() const {
+    return m_total_allocated_memory;
+  }
   // (see m_current_allocated_memory for description)
-  size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; }
+  size_t getCurrentAllocatedMemory() const {
+    return m_current_allocated_memory;
+  }
   // total GPU memory allocated in the process by Metal driver; including
   // implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
-  size_t getDriverAllocatedMemory() const { return current_allocated_size(); }
+  size_t getDriverAllocatedMemory() const {
+    return current_allocated_size();
+  }
   // recommended Max memory for Metal
-  size_t getRecommendedMaxMemory() const { return max_device_size(); }
+  size_t getRecommendedMaxMemory() const {
+    return max_device_size();
+  }
   // (see enum DebugVerbosity for description)
-  uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
+  uint32_t getDebugVerbosity() const {
+    return m_debug_verbosity;
+  }
   // returns the device that we allocate from
-  inline id<MTLDevice> Device() const { return m_device; }
+  inline id<MTLDevice> Device() const {
+    return m_device;
+  }
 
   // TODO: make a common function to do size unit conversions in PyTorch.
   inline std::string format_size(uint64_t size) const;
 
-private:
+ private:
   // (see m_high_watermark_ratio for description)
   constexpr static double default_high_watermark_ratio = 1.7;
   // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
   constexpr static double default_high_watermark_upper_bound = 2.0;
   // (see m_low_watermark_ratio for description)
   // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
-  constexpr static double default_low_watermark_ratio_unified  = 1.4;
+  constexpr static double default_low_watermark_ratio_unified = 1.4;
   constexpr static double default_low_watermark_ratio_discrete = 1.0;
 
   const id<MTLDevice> m_device;
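
To make the defaults above concrete: per the comment further down, recommendedMaxWorkingSetSize is typically about 75% of system memory, so on a hypothetical 32 GiB unified-memory machine it would be roughly 24 GiB; the high-watermark limit then lands near 1.7 x 24 = 40.8 GiB and the unified low-watermark near 1.4 x 24 = 33.6 GiB. A tiny sketch of that arithmetic (the 32 GiB figure is assumed for illustration):

#include <cstdio>

int main() {
  // assumed machine: 32 GiB unified memory; Metal typically recommends ~75% of it
  const double recommended_gib = 32.0 * 0.75;          // ~24 GiB working set
  const double high_watermark = 1.7 * recommended_gib; // default_high_watermark_ratio
  const double low_watermark = 1.4 * recommended_gib;  // default_low_watermark_ratio_unified
  std::printf("high: %.1f GiB, low: %.1f GiB\n", high_watermark, low_watermark);
}
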
@@ -387,14 +417,19 @@ class MPSHeapAllocatorImpl {
   size_t get_allocation_size(size_t size, uint32_t usage) const;
   // maximum size of device memory available for allocation in current process
   // Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
-  size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; }
+  size_t max_device_size() const {
+    return [m_device recommendedMaxWorkingSetSize];
+  }
   // there are implicit allocations from MPS backend, so we need to query the 'device' for
   // total allocated size instead of manually tracking in MPSAllocator
-  size_t current_allocated_size() const { return [m_device currentAllocatedSize]; }
+  size_t current_allocated_size() const {
+    return [m_device currentAllocatedSize];
+  }
 
   bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
     for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
-      MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event);
+      MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(
+          buffer_block ? buffer_block->buffer : nullptr, event);
     }
     return true;
   }
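
trigger_memory_callbacks above fans out allocator events to every IMpsAllocatorCallback registered with MPSAllocatorCallbacksRegistry. A hedged sketch of what a client-side callback could look like, assuming the interface and registration macro declared in MPSAllocatorInterface.h behave as their names suggest; the LoggingCallback class and its registry key are invented for illustration:

#include <ATen/mps/MPSAllocatorInterface.h>
#include <cstdio>

using namespace at::mps; // the registry and interface live in at::mps

namespace {
// invented example: logs every allocator event it receives
struct LoggingCallback : public IMpsAllocatorCallback {
  void executeMPSAllocatorCallback(void* ptr, EventType event) override {
    std::printf("MPS allocator event %d on buffer %p\n", static_cast<int>(event), ptr);
  }
};
} // namespace

// assumed registration macro from MPSAllocatorInterface.h
REGISTER_MPS_ALLOCATOR_CALLBACK("logging_callback", LoggingCallback);
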