Skip to content

Commit afdd34b

Browse files
malfetpobin6
authored and committed
[BE][MPS] Apply clang-format to mps headers (pytorch#140906)
It was a mistake to miss them in the past. All changes in this PR, except the ones to .lintrunner.toml, are generated by running `lintrunner -a --take CLANGFORMAT --all-files`. Pull Request resolved: pytorch#140906 Approved by: https://github.com/Skylion007
1 parent 9695016 commit afdd34b

24 files changed

+932
-823
lines changed

.lintrunner.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,12 @@ code = 'CLANGFORMAT'
5656
include_patterns = [
5757
'aten/src/ATen/*.h',
5858
'aten/src/ATen/mps/**/*.mm',
59+
'aten/src/ATen/mps/**/*.h',
5960
'aten/src/ATen/xpu/**/*.h',
6061
'aten/src/ATen/xpu/**/*.cpp',
6162
'aten/src/ATen/native/mps/**/*.metal',
6263
'aten/src/ATen/native/mps/**/*.mm',
64+
'aten/src/ATen/native/mps/**/*.h',
6365
'aten/src/ATen/native/vulkan/**/*.h',
6466
'aten/src/ATen/native/vulkan/**/*.cpp',
6567
'aten/src/ATen/native/cuda/MultiTensorApply.cuh',

aten/src/ATen/mps/EmptyTensor.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@ C10_EXPORT TensorBase empty_mps(
1212
std::optional<Device> device_opt,
1313
std::optional<bool> pin_memory_opt,
1414
std::optional<c10::MemoryFormat> memory_format_opt);
15-
C10_EXPORT TensorBase empty_mps(
16-
IntArrayRef size, const TensorOptions &options);
15+
C10_EXPORT TensorBase empty_mps(IntArrayRef size, const TensorOptions& options);
1716

1817
C10_EXPORT TensorBase empty_strided_mps(
1918
IntArrayRef size,
@@ -24,6 +23,6 @@ C10_EXPORT TensorBase empty_strided_mps(
2423
C10_EXPORT TensorBase empty_strided_mps(
2524
IntArrayRef size,
2625
IntArrayRef stride,
27-
const TensorOptions &options);
26+
const TensorOptions& options);
2827

2928
} // namespace at::detail

aten/src/ATen/mps/IndexKernels.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
namespace at::mps {
44

5-
static const char *SCATTER_OPS_TEMPLATE = R"METAL_SCATTER(
5+
static const char* SCATTER_OPS_TEMPLATE = R"METAL_SCATTER(
66
struct __attribute__ ((packed)) packed_uint5{{
77
uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u;
88
}};
@@ -120,7 +120,7 @@ kernel void scatter_kernel_1(uint linear_index [[thread_position_in
120120
}}
121121
)METAL_SCATTER";
122122

123-
static const char *GATHER_OPS_TEMPLATE = R"METAL_GATHER(
123+
static const char* GATHER_OPS_TEMPLATE = R"METAL_GATHER(
124124
struct __attribute__ ((packed)) packed_uint5{{
125125
uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u;
126126
}};

aten/src/ATen/mps/MPSAllocator.h

Lines changed: 96 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -6,45 +6,47 @@
66
#include <ATen/mps/MPSEvent.h>
77
#include <ATen/mps/MPSStream.h>
88

9+
#include <c10/util/flat_hash_map.h>
10+
#include <mach/vm_page_size.h>
911
#include <cstdio>
1012
#include <mutex>
1113
#include <set>
1214
#include <unordered_set>
13-
#include <mach/vm_page_size.h>
14-
#include <c10/util/flat_hash_map.h>
1515

1616
// this implementation is based on CUDACachingAllocator.
1717
// It utilizes Metal Heaps to improve the performance with buffer allocation.
1818
// Do not include this header. Use MPSAllocatorInterface.h instead.
1919
// TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
2020
namespace at::mps::HeapAllocator {
2121

22-
static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB
23-
static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap
24-
static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB
25-
static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps
26-
static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps
27-
static const size_t kXLargeHeapD = MB(128); // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
28-
static const size_t kXLargeHeapU = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
22+
static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB
23+
static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap
24+
static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB
25+
static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps
26+
static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps
27+
static const size_t kXLargeHeapD =
28+
MB(128); // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
29+
static const size_t kXLargeHeapU =
30+
MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
2931
static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation
3032

3133
// buffer pools could be customized with a combination of usage flags
3234
enum UsageFlags : uint32_t {
3335
PRIVATE = 0,
34-
SMALL = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
35-
SHARED = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
36+
SMALL = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
37+
SHARED = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
3638
MANAGED = (1 << 2), // managed storage mode
37-
HAZARD = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
38-
SCALAR = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
39+
HAZARD = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
40+
SCALAR = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
3941
};
4042
// debug verbosity flags
4143
enum DebugVerbosity : uint32_t {
42-
SILENT = 0,
43-
PROFILING = (1 << 0), // print generic profiling data for total system memory usage
44+
SILENT = 0,
45+
PROFILING = (1 << 0), // print generic profiling data for total system memory usage
4446
ALLOCATIONS = (1 << 1), // print buffer allocations
45-
RECYCLES = (1 << 2), // print buffer recycling
46-
RELEASES = (1 << 3), // print buffer releases
47-
LARGE_ONLY = (1 << 4), // only log large buffer pool transactions
47+
RECYCLES = (1 << 2), // print buffer recycling
48+
RELEASES = (1 << 3), // print buffer releases
49+
LARGE_ONLY = (1 << 4), // only log large buffer pool transactions
4850
};
4951

5052
struct HeapBlock;
@@ -67,10 +69,8 @@ struct BufferBlock {
6769
// Metal events used to sync GPU/CPU operations on the shared-storage buffers
6870
MPSEventPtr event;
6971

70-
BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr,
71-
HeapBlock* Heap = nullptr) :
72-
buffer(Buffer), size(Size), requested_size(RequestedSize),
73-
heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) { }
72+
BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr, HeapBlock* Heap = nullptr)
73+
: buffer(Buffer), size(Size), requested_size(RequestedSize), heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) {}
7474

7575
static bool Comparator(const BufferBlock* a, const BufferBlock* b) {
7676
return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer;
@@ -79,15 +79,19 @@ struct BufferBlock {
7979
assert(((Alignment - 1) & Alignment) == 0);
8080
return ((Size + Alignment - 1) & ~(Alignment - 1));
8181
}
82-
uint32_t retainCount() const { return [buffer retainCount]; }
82+
uint32_t retainCount() const {
83+
return [buffer retainCount];
84+
}
8385
};
8486
typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*);
8587

8688
struct BufferPool;
8789
struct AllocParams {
88-
AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) :
89-
search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) { }
90-
size_t size() const { return search_key.size; }
90+
AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool)
91+
: search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) {}
92+
size_t size() const {
93+
return search_key.size;
94+
}
9195

9296
BufferBlock search_key;
9397
BufferPool* pool;
@@ -102,7 +106,9 @@ struct AllocParams {
102106

103107
struct HeapBlock {
104108
id<MTLHeap> heap;
105-
struct { size_t total, available; } size;
109+
struct {
110+
size_t total, available;
111+
} size;
106112
BufferPool* pool;
107113
unsigned int n_buffers = 0;
108114
id_t heap_id;
@@ -111,9 +117,12 @@ struct HeapBlock {
111117
// counter to assign unique ids to heap blocks
112118
static uint64_t heap_counter;
113119

114-
HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool *Pool = nullptr) :
115-
heap(Heap), size({.total = Size, .available = Size}), pool(Pool),
116-
heap_id(Heap ? ++heap_counter : 0), is_split(true) { }
120+
HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool* Pool = nullptr)
121+
: heap(Heap),
122+
size({.total = Size, .available = Size}),
123+
pool(Pool),
124+
heap_id(Heap ? ++heap_counter : 0),
125+
is_split(true) {}
117126

118127
static MTLResourceOptions getOptions(uint32_t usage) {
119128
// TODO: check the caching performance of write-combined mode
@@ -126,16 +135,17 @@ struct HeapBlock {
126135
else
127136
options |= MTLResourceStorageModePrivate;
128137

129-
options |= (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;
138+
options |=
139+
(usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;
130140

131141
return options;
132142
}
133143

134144
static HeapBlock* createHeapBlock(AllocParams& params, id<MTLDevice> device, uint32_t usage) {
135-
HeapBlock *heapBlock = nullptr;
145+
HeapBlock* heapBlock = nullptr;
136146
bool is_split = true;
137147
const size_t size = params.size();
138-
MTLHeapDescriptor *d = [MTLHeapDescriptor new];
148+
MTLHeapDescriptor* d = [MTLHeapDescriptor new];
139149
if (d) {
140150
const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD;
141151
if (size <= kMaxSmallAlloc) {
@@ -152,10 +162,11 @@ struct HeapBlock {
152162
d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
153163
// this automatically handles Metal buffer access synchronizations at the
154164
// cost of slightly lower performance.
155-
d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
165+
d.hazardTrackingMode =
166+
(usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
156167
d.resourceOptions = getOptions(usage);
157168
d.type = MTLHeapTypeAutomatic;
158-
id<MTLHeap> heap = [device newHeapWithDescriptor: d];
169+
id<MTLHeap> heap = [device newHeapWithDescriptor:d];
159170
if (heap) {
160171
[heap setPurgeableState:MTLPurgeableStateNonVolatile];
161172
const size_t heap_size = heapAvailableSize(heap);
@@ -169,8 +180,8 @@ struct HeapBlock {
169180
return heapBlock;
170181
}
171182
static bool Comparator(const HeapBlock* a, const HeapBlock* b) {
172-
return (a->size.available != b->size.available) ? a->size.available < b->size.available :
173-
(uintptr_t)a->heap < (uintptr_t)b->heap;
183+
return (a->size.available != b->size.available) ? a->size.available < b->size.available
184+
: (uintptr_t)a->heap < (uintptr_t)b->heap;
174185
}
175186
static NSUInteger heapAvailableSize(id<MTLHeap> heap, size_t Alignment = vm_page_size) {
176187
return [heap maxAvailableSizeWithAlignment:Alignment];
@@ -205,8 +216,12 @@ struct HeapBlock {
205216
size.available = 0;
206217
return retainCount;
207218
}
208-
uint32_t retainCount() const { return [heap retainCount]; }
209-
void updateAvailableSize() { size.available = heapAvailableSize(heap); }
219+
uint32_t retainCount() const {
220+
return [heap retainCount];
221+
}
222+
void updateAvailableSize() {
223+
size.available = heapAvailableSize(heap);
224+
}
210225
};
211226
typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*);
212227

@@ -219,9 +234,8 @@ struct BufferPool {
219234
SCALAR,
220235
};
221236

222-
BufferPool(const id<MTLDevice> Device, uint32_t Usage) :
223-
device(Device), usage(Usage),
224-
heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) { }
237+
BufferPool(const id<MTLDevice> Device, uint32_t Usage)
238+
: device(Device), usage(Usage), heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) {}
225239

226240
const id<MTLDevice> device;
227241
// usage flags to customize the pool for various purposes (see UsageFlags enum)
@@ -248,12 +262,12 @@ struct BufferPool {
248262
};
249263

250264
class MPSHeapAllocatorImpl {
251-
public:
252-
explicit MPSHeapAllocatorImpl() :
253-
m_device(at::mps::MPSDevice::getInstance()->device()),
254-
m_max_buffer_size([m_device maxBufferLength]),
255-
m_stream(getDefaultMPSStream()),
256-
m_event_pool(getMPSEventPool()) {
265+
public:
266+
explicit MPSHeapAllocatorImpl()
267+
: m_device(at::mps::MPSDevice::getInstance()->device()),
268+
m_max_buffer_size([m_device maxBufferLength]),
269+
m_stream(getDefaultMPSStream()),
270+
m_event_pool(getMPSEventPool()) {
257271
init_allocator();
258272
}
259273
~MPSHeapAllocatorImpl() {
@@ -298,34 +312,50 @@ class MPSHeapAllocatorImpl {
298312
// (see m_high_watermark_ratio for description)
299313
void setHighWatermarkRatio(double ratio);
300314
// (see m_low_watermark_limit for description)
301-
size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
315+
size_t getLowWatermarkLimit() const {
316+
return m_low_watermark_limit;
317+
}
302318
// (see m_max_total_allowed_size for description)
303-
size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
319+
size_t getHighWatermarkLimit() const {
320+
return m_max_total_allowed_size;
321+
}
304322
// (see m_total_allocated_memory for description)
305-
size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; }
323+
size_t getTotalAllocatedMemory() const {
324+
return m_total_allocated_memory;
325+
}
306326
// (see m_current_allocated_memory for description)
307-
size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; }
327+
size_t getCurrentAllocatedMemory() const {
328+
return m_current_allocated_memory;
329+
}
308330
// total GPU memory allocated in the process by Metal driver; including
309331
// implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
310-
size_t getDriverAllocatedMemory() const { return current_allocated_size(); }
332+
size_t getDriverAllocatedMemory() const {
333+
return current_allocated_size();
334+
}
311335
// recommended Max memory for Metal
312-
size_t getRecommendedMaxMemory() const { return max_device_size(); }
336+
size_t getRecommendedMaxMemory() const {
337+
return max_device_size();
338+
}
313339
// (see enum DebugVerbosity for description)
314-
uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
340+
uint32_t getDebugVerbosity() const {
341+
return m_debug_verbosity;
342+
}
315343
// returns the device that we allocate from
316-
inline id<MTLDevice> Device() const { return m_device; }
344+
inline id<MTLDevice> Device() const {
345+
return m_device;
346+
}
317347

318348
// TODO: make a common function to do size unit conversions in PyTorch.
319349
inline std::string format_size(uint64_t size) const;
320350

321-
private:
351+
private:
322352
// (see m_high_watermark_ratio for description)
323353
constexpr static double default_high_watermark_ratio = 1.7;
324354
// we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
325355
constexpr static double default_high_watermark_upper_bound = 2.0;
326356
// (see m_low_watermark_ratio for description)
327357
// on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
328-
constexpr static double default_low_watermark_ratio_unified = 1.4;
358+
constexpr static double default_low_watermark_ratio_unified = 1.4;
329359
constexpr static double default_low_watermark_ratio_discrete = 1.0;
330360

331361
const id<MTLDevice> m_device;
@@ -387,14 +417,19 @@ class MPSHeapAllocatorImpl {
387417
size_t get_allocation_size(size_t size, uint32_t usage) const;
388418
// maximum size of device memory available for allocation in current process
389419
// Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
390-
size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; }
420+
size_t max_device_size() const {
421+
return [m_device recommendedMaxWorkingSetSize];
422+
}
391423
// there are implicit allocations from MPS backend, so we need to query the 'device' for
392424
// total allocated size instead of manually tracking in MPSAllocator
393-
size_t current_allocated_size() const { return [m_device currentAllocatedSize]; }
425+
size_t current_allocated_size() const {
426+
return [m_device currentAllocatedSize];
427+
}
394428

395429
bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
396430
for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
397-
MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event);
431+
MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(
432+
buffer_block ? buffer_block->buffer : nullptr, event);
398433
}
399434
return true;
400435
}

0 commit comments

Comments
 (0)