@@ -6564,18 +6564,16 @@ struct scoped_spin_lock {
 
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
-#if 0
-#define DEBUG_CUDA_MALLOC
+// #define DEBUG_CUDA_MALLOC
 struct cuda_buffer {
     void * ptr = nullptr;
     size_t size = 0;
 };
 
 static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-
 static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 
-static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc_leg(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
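Note: the rest of the legacy allocator is unchanged and elided between these hunks. As context, here is a simplified sketch of how such a cudaMalloc-backed buffer cache typically works; the function name and the first-fit scan are illustrative assumptions, not the exact elided body.

```cpp
// Hedged sketch of a cudaMalloc-backed buffer cache (assumed shape of the
// elided body): reuse a cached buffer if one is large enough, otherwise
// fall back to a fresh cudaMalloc.
static void * cuda_pool_malloc_sketch(size_t size, size_t * actual_size) {
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
        cuda_buffer & b = g_cuda_buffer_pool[id][i];
        if (b.ptr != nullptr && b.size >= size) {
            void * ptr = b.ptr;    // take the cached buffer out of the pool
            *actual_size = b.size; // caller learns the true capacity
            b.ptr  = nullptr;
            b.size = 0;
            return ptr;
        }
    }
    void * ptr;
    CUDA_CHECK(cudaMalloc(&ptr, size)); // no cached buffer fits: allocate
    *actual_size = size;
    g_cuda_pool_size[id] += size;
    return ptr;
}
```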
@@ -6629,7 +6627,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }
 
-static void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
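The matching legacy free path is likewise elided. A sketch under the same assumptions: the buffer goes back into an empty cache slot, with the `cudaFree` fallback shown at the top of the next hunk used when the cache is full.

```cpp
// Assumed shape of the elided free path: return the buffer to an empty
// cache slot, or hand it back to the driver if every slot is occupied.
static void cuda_pool_free_sketch(void * ptr, size_t size) {
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
        cuda_buffer & b = g_cuda_buffer_pool[id][i];
        if (b.ptr == nullptr) { // empty slot: cache the buffer for reuse
            b.ptr  = ptr;
            b.size = size;
            return;
        }
    }
    // cache full: release the memory (these lines appear in the next hunk)
    CUDA_CHECK(cudaFree(ptr));
    g_cuda_pool_size[id] -= size;
}
```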
@@ -6646,19 +6644,15 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
     g_cuda_pool_size[id] -= size;
 }
-#else
 
+#if !defined(GGML_USE_HIPBLAS)
+// pool with virtual memory
 static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
 static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
-static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
-
 static const size_t CUDA_POOL_MAX_SIZE = 1ull << 36; // 64 GB
 
-// #define DEBUG_CUDA_MALLOC
-
-#define ggml_cuda_pool_malloc(size, actual_size) ggml_cuda_pool_malloc_(size, actual_size, #size " " #actual_size)
-static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
+static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
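The heart of the change sits in the elided region below: rather than caching discrete `cudaMalloc` buffers, the VMM pool reserves one large virtual address range per device and maps physical memory into its tail on demand, so the pool grows in place without copies or fragmentation. A hedged sketch of that growth step using the CUDA driver VMM API (the exact elided code may differ in details such as rounding and error handling):

```cpp
// Hedged sketch of the elided pool-growth logic inside
// ggml_cuda_pool_malloc_vmm; the globals match the declarations above.
static void cuda_pool_grow_sketch(int id, size_t size) {
    size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
    if (size <= avail) {
        return; // enough mapped memory already
    }
    size_t reserve_size = size - avail;

    // round the mapping up to the device's allocation granularity
    CUmemAllocationProp prop = {};
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = id;
    size_t granularity = 0;
    CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
    reserve_size = ((reserve_size + granularity - 1) / granularity) * granularity;

    // reserve the full virtual address range once, on first use
    if (g_cuda_pool_addr[id] == 0) {
        CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_MAX_SIZE, 0, 0, 0));
    }

    // create physical backing memory and map it at the end of the pool
    CUmemGenericAllocationHandle handle;
    CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
    CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, 0, handle, 0));

    // make the newly mapped range read/write accessible from this device
    CUmemAccessDesc access = {};
    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access.location.id   = id;
    access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, &access, 1));

    // bookkeeping (these two lines appear as context in the next hunk)
    g_cuda_pool_handles[id].push_back(handle);
    g_cuda_pool_size[id] += reserve_size;
}
```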
@@ -6705,9 +6699,9 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
         g_cuda_pool_handles[id].push_back(handle);
         g_cuda_pool_size[id] += reserve_size;
 
-        printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB) [%s]\n",
-               id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
-               (unsigned long long) (reserve_size/1024/1024), call);
+        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
+        //       (unsigned long long) (reserve_size/1024/1024));
     }
 
     GGML_ASSERT(g_cuda_pool_addr[id] != 0);
@@ -6717,32 +6711,51 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
     g_cuda_pool_used[id] += size;
 
 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+    printf("cuda pool[%d]: allocated %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
 #endif
 
     return ptr;
-
-    GGML_UNUSED(call);
 }
 
-#define ggml_cuda_pool_free(ptr, size) ggml_cuda_pool_free_(ptr, size, #ptr " " #size)
-static void ggml_cuda_pool_free_(void * ptr, size_t size, const char * call) {
+static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: free %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+    printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
 #endif
 
     g_cuda_pool_used[id] -= size;
 
     // all deallocations must be in reverse order of the allocations
     GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
+}
 
-    GGML_UNUSED(call);
+static bool g_device_vmm[GGML_CUDA_MAX_DEVICES] = {false};
+
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    if (g_device_vmm[id]) {
+        return ggml_cuda_pool_malloc_vmm(size, actual_size);
+    } else {
+        return ggml_cuda_pool_malloc_leg(size, actual_size);
+    }
 }
 
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    if (g_device_vmm[id]) {
+        ggml_cuda_pool_free_vmm(ptr, size);
+    } else {
+        ggml_cuda_pool_free_leg(ptr, size);
+    }
+}
+#else
+#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
+#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
 #endif
 
 static bool g_cublas_loaded = false;
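With the runtime dispatch in place, call sites need no changes. The one behavioral caveat, per the assert in `ggml_cuda_pool_free_vmm` above, is that the VMM pool is a bump/stack allocator: frees must occur in reverse allocation order. A hypothetical call site:

```cpp
// Hypothetical call site: the VMM pool hands out addresses bump-style from
// g_cuda_pool_addr, so frees must mirror allocation order (LIFO).
size_t actual_a = 0, actual_b = 0;
void * a = ggml_cuda_pool_malloc(1024, &actual_a);
void * b = ggml_cuda_pool_malloc(2048, &actual_b);
ggml_cuda_pool_free(b, actual_b); // b was allocated last, so it is freed first
ggml_cuda_pool_free(a, actual_a);
```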
@@ -6783,9 +6796,18 @@ void ggml_init_cublas() {
 #endif
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
     for (int id = 0; id < g_device_count; ++id) {
+        int deviceSupportsVmm = 0;
+
+#if !defined(GGML_USE_HIPBLAS)
+        CUdevice device;
+        CU_CHECK(cuDeviceGet(&device, id));
+        CU_CHECK(cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
+        g_device_vmm[id] = !!deviceSupportsVmm;
+#endif
+
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, deviceSupportsVmm ? "yes" : "no");
 
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
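`CU_CHECK` is the driver-API counterpart of `CUDA_CHECK` (driver calls return `CUresult` rather than `cudaError_t`); its definition is outside this diff. A minimal sketch of the assumed shape:

```cpp
// Minimal sketch of an assumed CU_CHECK macro for CUDA driver API calls;
// the real definition in the file may differ.
#define CU_CHECK(err)                                                   \
    do {                                                                \
        CUresult err_ = (err);                                          \
        if (err_ != CUDA_SUCCESS) {                                     \
            const char * err_str = nullptr;                             \
            cuGetErrorString(err_, &err_str);                           \
            fprintf(stderr, "CUDA driver error: %s in %s:%d\n",         \
                    err_str ? err_str : "unknown", __FILE__, __LINE__); \
            exit(1);                                                    \
        }                                                               \
    } while (0)
```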