Skip to content

Commit b791f7f

Browse files
committed
vulkan: select only one device for single gpu with multiple drivers
1 parent 7a16ce7 commit b791f7f

File tree

1 file changed

+80
-4
lines changed

1 file changed

+80
-4
lines changed

ggml-vulkan.cpp

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#include "ggml-vulkan.h"
2-
2+
#include <vulkan/vulkan_core.h>
33
#ifdef GGML_VULKAN_RUN_TESTS
44
#include <chrono>
55
#endif
@@ -9,12 +9,13 @@
99
#include <algorithm>
1010
#include <cmath>
1111
#include <iostream>
12-
#include <limits>
1312
#include <tuple>
1413
#include <vector>
1514
#include <sstream>
1615
#include <utility>
1716
#include <memory>
17+
#include <limits>
18+
#include <map>
1819

1920
#include "ggml.h"
2021
#include "ggml-backend-impl.h"
@@ -1566,8 +1567,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
15661567
vk::PhysicalDeviceProperties2 props2;
15671568
vk::PhysicalDeviceMaintenance3Properties props3;
15681569
vk::PhysicalDeviceSubgroupProperties subgroup_props;
1570+
vk::PhysicalDeviceDriverProperties driver_props;
15691571
props2.pNext = &props3;
15701572
props3.pNext = &subgroup_props;
1573+
subgroup_props.pNext = &driver_props;
15711574
physical_device.getProperties2(&props2);
15721575

15731576
const size_t subgroup_size = subgroup_props.subgroupSize;
@@ -1611,7 +1614,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
16111614
fp16 = fp16 && vk12_features.shaderFloat16;
16121615

16131616
std::string device_name = props2.properties.deviceName.data();
1614-
std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
1617+
std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
16151618

16161619
if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
16171620
std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1707,7 +1710,80 @@ void ggml_vk_instance_init() {
17071710
vk::PhysicalDeviceProperties props = devices[i].getProperties();
17081711

17091712
if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
1710-
vk_instance.device_indices.push_back(i);
1713+
// Check if there are two physical devices corresponding to the same GPU
1714+
auto old_device = std::find_if(
1715+
vk_instance.device_indices.begin(),
1716+
vk_instance.device_indices.end(),
1717+
[&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
1718+
);
1719+
if (old_device == vk_instance.device_indices.end()) {
1720+
vk_instance.device_indices.push_back(i);
1721+
} else {
1722+
// There can be two physical devices corresponding to the same GPU if there are 2 different drivers
1723+
// This can cause errors when splitting layers across the devices, so we need to keep only one
1724+
#ifdef GGML_VULKAN_DEBUG
1725+
std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
1726+
#endif
1727+
1728+
vk::PhysicalDeviceProperties2 old_prop;
1729+
vk::PhysicalDeviceDriverProperties old_driver;
1730+
old_prop.pNext = &old_driver;
1731+
devices[*old_device].getProperties2(&old_prop);
1732+
1733+
vk::PhysicalDeviceProperties2 new_prop;
1734+
vk::PhysicalDeviceDriverProperties new_driver;
1735+
new_prop.pNext = &new_driver;
1736+
devices[i].getProperties2(&new_prop);
1737+
1738+
std::map<vk::DriverId, int> driver_priorities {};
1739+
int old_priority = std::numeric_limits<int>::max();
1740+
int new_priority = std::numeric_limits<int>::max();
1741+
1742+
// See https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver IDs
1743+
// Smaller number -> higher priority
1744+
switch (old_prop.properties.vendorID) {
1745+
case VK_VENDOR_ID_AMD:
1746+
driver_priorities[vk::DriverId::eMesaRadv] = 1;
1747+
driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
1748+
driver_priorities[vk::DriverId::eAmdProprietary] = 3;
1749+
break;
1750+
case VK_VENDOR_ID_INTEL:
1751+
driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
1752+
driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
1753+
break;
1754+
case VK_VENDOR_ID_NVIDIA:
1755+
driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
1756+
1757+
VK_API_VERSION_MAJOR(VK_API_VERSION_1_3);
1758+
#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
1759+
driver_priorities[vk::DriverId::eMesaNvk] = 2;
1760+
#endif
1761+
break;
1762+
}
1763+
1764+
if (driver_priorities.count(old_driver.driverID)) {
1765+
old_priority = driver_priorities[old_driver.driverID];
1766+
}
1767+
if (driver_priorities.count(new_driver.driverID)) {
1768+
new_priority = driver_priorities[new_driver.driverID];
1769+
}
1770+
1771+
if (new_priority < old_priority) {
1772+
auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
1773+
vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
1774+
vk_instance.device_indices.push_back(i);
1775+
1776+
#ifdef GGML_VULKAN_DEBUG
1777+
std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
1778+
#endif
1779+
}
1780+
#ifdef GGML_VULKAN_DEBUG
1781+
else {
1782+
std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
1783+
1784+
}
1785+
#endif
1786+
}
17111787
}
17121788
}
17131789

0 commit comments

Comments
 (0)