From 4bc3a799556880ecc83d181b1906e8b698e21c69 Mon Sep 17 00:00:00 2001 From: Adriankhl Date: Sat, 8 Jun 2024 21:05:17 +0800 Subject: [PATCH] vulkan: select only one device for single gpu with multiple drivers --- ggml-vulkan.cpp | 79 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 128769177f1021..238d5ec78eb1c4 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1,5 +1,4 @@ #include "ggml-vulkan.h" - #ifdef GGML_VULKAN_RUN_TESTS #include #endif @@ -9,12 +8,13 @@ #include #include #include -#include #include #include #include #include #include +#include +#include #include "ggml.h" #include "ggml-backend-impl.h" @@ -1566,8 +1566,10 @@ static void ggml_vk_print_gpu_info(size_t idx) { vk::PhysicalDeviceProperties2 props2; vk::PhysicalDeviceMaintenance3Properties props3; vk::PhysicalDeviceSubgroupProperties subgroup_props; + vk::PhysicalDeviceDriverProperties driver_props; props2.pNext = &props3; props3.pNext = &subgroup_props; + subgroup_props.pNext = &driver_props; physical_device.getProperties2(&props2); const size_t subgroup_size = subgroup_props.subgroupSize; @@ -1611,7 +1613,7 @@ static void ggml_vk_print_gpu_info(size_t idx) { fp16 = fp16 && vk12_features.shaderFloat16; std::string device_name = props2.properties.deviceName.data(); - std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl; + std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl; if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." 
<< std::endl; @@ -1707,7 +1709,76 @@ void ggml_vk_instance_init() { vk::PhysicalDeviceProperties props = devices[i].getProperties(); if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) { - vk_instance.device_indices.push_back(i); + // Check if there are two physical devices corresponding to the same GPU + auto old_device = std::find_if( + vk_instance.device_indices.begin(), + vk_instance.device_indices.end(), + [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; } + ); + if (old_device == vk_instance.device_indices.end()) { + vk_instance.device_indices.push_back(i); + } else { + // There can be two physical devices corresponding to the same GPU if there are 2 different drivers + // This can cause error when splitting layers across the devices, need to keep only 1 +#ifdef GGML_VULKAN_DEBUG + std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl; +#endif + + vk::PhysicalDeviceProperties2 old_prop; + vk::PhysicalDeviceDriverProperties old_driver; + old_prop.pNext = &old_driver; + devices[*old_device].getProperties2(&old_prop); + + vk::PhysicalDeviceProperties2 new_prop; + vk::PhysicalDeviceDriverProperties new_driver; + new_prop.pNext = &new_driver; + devices[i].getProperties2(&new_prop); + + std::map driver_priorities {}; + int old_priority = std::numeric_limits::max(); + int new_priority = std::numeric_limits::max(); + + // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id + // Smaller number -> higher priority + switch (old_prop.properties.vendorID) { + case VK_VENDOR_ID_AMD: + driver_priorities[static_cast(VkDriverId::VK_DRIVER_ID_MESA_RADV)] = 1; + driver_priorities[static_cast(VkDriverId::VK_DRIVER_ID_AMD_OPEN_SOURCE)] = 2; + driver_priorities[static_cast(VkDriverId::VK_DRIVER_ID_AMD_PROPRIETARY)] = 3; + break; + case VK_VENDOR_ID_INTEL: + 
driver_priorities[static_cast(VkDriverId::VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA)] = 1; + driver_priorities[static_cast(VkDriverId::VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS)] = 2; + break; + case VK_VENDOR_ID_NVIDIA: + driver_priorities[static_cast(VkDriverId::VK_DRIVER_ID_NVIDIA_PROPRIETARY)] = 1; + driver_priorities[static_cast(VkDriverId::VK_DRIVER_ID_MESA_NVK)] = 2; + break; + } + + if (driver_priorities.count(old_driver.driverID)) { + old_priority = driver_priorities[old_driver.driverID]; + } + if (driver_priorities.count(new_driver.driverID)) { + new_priority = driver_priorities[new_driver.driverID]; + } + + if (new_priority < old_priority) { + auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device); + vk_instance.device_indices.erase(r, vk_instance.device_indices.end()); + vk_instance.device_indices.push_back(i); + +#ifdef GGML_VULKAN_DEBUG + std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl; +#endif + } +#ifdef GGML_VULKAN_DEBUG + else { + std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl; + + } +#endif + } } }