Skip to content

Commit 7bd1195

Browse files
committed
AMD: parse the architecture as supplied by gcnArchName
The value provided by `minor` is truncated on AMD devices, so parse the architecture string returned by gcnArchName instead to retrieve an accurate ID. Also use a single common value for all GCN4 devices (gfx800) to avoid missing compatible devices.
1 parent b4d92a5 commit 7bd1195

File tree

2 files changed

+29
-12
lines changed

2 files changed

+29
-12
lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,20 +46,20 @@
4646
#define GGML_CUDA_CC_VOLTA 700
4747
#define GGML_CUDA_CC_TURING 750
4848
#define GGML_CUDA_CC_AMPERE 800
49-
#define GGML_CUDA_CC_OFFSET_AMD 1000000
49+
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
5050

5151
// GCN/CDNA, wave size is 64
52-
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
53-
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
54-
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
55-
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
56-
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
57-
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
52+
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x800) // Tonga, Fiji, Polaris, minimum for fast fp16
53+
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
54+
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
55+
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
56+
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renaming
57+
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
5858

5959
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
60-
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
61-
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
62-
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
60+
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
61+
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
62+
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
6363

6464
#define GGML_CUDA_CC_QY1 210
6565
#define GGML_CUDA_CC_QY2 220

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
169169

170170
cudaDeviceProp prop;
171171
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
172-
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
172+
GGML_LOG_INFO(" Device %d: %s", id, prop.name);
173173

174174
info.default_tensor_split[id] = total_vram;
175175
total_vram += prop.totalGlobalMem;
@@ -178,11 +178,28 @@ static ggml_cuda_device_info ggml_cuda_init() {
178178
info.devices[id].smpb = prop.sharedMemPerBlock;
179179
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
180180
info.devices[id].smpbo = prop.sharedMemPerBlock;
181-
info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
181+
// Device architectures are returned as gfxMmm with M the major as an integer and mm minor as hexadecimal
182+
// we can treat it all as hexadecimal for simplicity
183+
int archLen = strlen(prop.gcnArchName);
184+
char archName[archLen + 1];
185+
strcpy(archName, prop.gcnArchName);
186+
int archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
187+
archName[archLen - 2] = '\0';
188+
int archMajor = (int)strtoul(&archName[3], 0, 16);
189+
info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + archMajor * 0x100;
190+
if (archMajor != 8) {
191+
info.devices[id].cc = info.devices[id].cc + archMinor;
192+
}
193+
GGML_LOG_INFO(", arch gfx%x%02x (0x%x)", archMajor, archMinor, info.devices[id].cc ^ GGML_CUDA_CC_OFFSET_AMD);
182194
#else
183195
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
184196
info.devices[id].cc = 100*prop.major + 10*prop.minor;
197+
GGML_LOG_INFO(", compute capability %d.%d", prop.major, prop.minor);
185198
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
199+
GGML_LOG_INFO(", VMM: %s\n", device_vmm ? "yes" : "no");
200+
if (prop.major < 1) {
201+
GGML_LOG_WARN("Invalid compute version returned for device %d %s: %d\n", id, prop.name, prop.major);
202+
}
186203
}
187204

188205
for (int id = 0; id < info.device_count; ++id) {

0 commit comments

Comments
 (0)