Skip to content

Commit

Permalink
AMD: parse the architecture as supplied by gcnArchName
Browse files Browse the repository at this point in the history
The value provided by `minor` is truncated for AMD; parse the value returned by `gcnArchName` instead to retrieve an accurate architecture ID.

We can also use the common GCN4 value, gfx800, to avoid missing compatible devices.
  • Loading branch information
Haus1 committed Jan 14, 2025
1 parent b4d92a5 commit 7bd1195
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 12 deletions.
20 changes: 10 additions & 10 deletions ggml/src/ggml-cuda/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,20 +46,20 @@
#define GGML_CUDA_CC_VOLTA 700
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
#define GGML_CUDA_CC_OFFSET_AMD 1000000
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000

// AMD compute capabilities below are encoded as GGML_CUDA_CC_OFFSET_AMD plus
// the gfx architecture ID read as hexadecimal (e.g. gfx906 -> 0x906).

// GCN/CDNA, wave size is 64
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x800) // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renaming
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300

// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA

#define GGML_CUDA_CC_QY1 210
#define GGML_CUDA_CC_QY2 220
Expand Down
21 changes: 19 additions & 2 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
GGML_LOG_INFO(" Device %d: %s", id, prop.name);

info.default_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
Expand All @@ -178,11 +178,28 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].smpb = prop.sharedMemPerBlock;
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpbo = prop.sharedMemPerBlock;
info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
// Device architectures are returned as gfxMmm with M the major as an integer and mm minor as hexadecimal
// we can treat it all as hexadecimal for simplicity
int archLen = strlen(prop.gcnArchName);
char archName[archLen + 1];
strcpy(archName, prop.gcnArchName);
int archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
archName[archLen - 2] = '\0';
int archMajor = (int)strtoul(&archName[3], 0, 16);
info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + archMajor * 0x100;
if (archMajor != 8) {
info.devices[id].cc = info.devices[id].cc + archMinor;
}
GGML_LOG_INFO(", arch gfx%x%02x (0x%x)", archMajor, archMinor, info.devices[id].cc ^ GGML_CUDA_CC_OFFSET_AMD);
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
GGML_LOG_INFO(", compute capability %d.%d", prop.major, prop.minor);
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
GGML_LOG_INFO(", VMM: %s\n", device_vmm ? "yes" : "no");
if (prop.major < 1) {
GGML_LOG_WARN("Invalid compute version returned for device %d %s: %d\n", id, prop.name, prop.major);
}
}

for (int id = 0; id < info.device_count; ++id) {
Expand Down

0 comments on commit 7bd1195

Please sign in to comment.