1919// max number of MTLCommandBuffer used to submit a graph for processing
2020#define GGML_METAL_MAX_COMMAND_BUFFERS 8
2121
22+ #define GGML_METAL_MAX_RESIDENCY_SETS 128
23+
2224#define UNUSED (x ) (void )(x)
2325
2426// globals
3739 id <MTLDevice > mtl_device;
3840 int mtl_device_ref_count;
3941
42+ id <MTLResidencySet> mtl_residency_set[GGML_METAL_MAX_RESIDENCY_SETS];
43+ int mtl_residency_set_n;
44+
4045 bool has_simdgroup_reduction;
4146 bool has_simdgroup_mm;
4247 bool has_bfloat;
4651} g_ggml_ctx_dev_main = {
4752 /* .mtl_device =*/ nil ,
4853 /* .mtl_device_ref_count =*/ 0 ,
54+ /* .mtl_residency_set =*/ { nil },
55+ /* .mtl_residency_set_n =*/ 0 ,
4956 /* .has_simdgroup_reduction =*/ false ,
5057 /* .has_simdgroup_mm =*/ false ,
5158 /* .has_bfloat =*/ false ,
@@ -95,6 +102,42 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
95102 }
96103}
97104
// add residency set
// appends the residency set to the list tracked by the device context
// - ctx:           device context that owns the list (must not be NULL)
// - residency_set: residency set to track (must not be nil); the list stores the
//                  pointer as-is and does not retain it
// returns true on success, false (after logging) if the list already holds
// GGML_METAL_MAX_RESIDENCY_SETS entries
static bool ggml_backend_metal_device_add_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
    assert(ctx != NULL);
    // validate the actual argument - there is no `queue` in this function
    assert(residency_set != nil);

    if (ctx->mtl_residency_set_n >= GGML_METAL_MAX_RESIDENCY_SETS) {
        GGML_LOG_ERROR("%s: warning: maximum number of residency sets reached\n", __func__);
        return false;
    }

    ctx->mtl_residency_set[ctx->mtl_residency_set_n++] = residency_set;

    return true;
}
119+
// remove residency set
// looks up the residency set in the list tracked by the device context and, if it
// is present, closes the gap by moving every later entry one slot towards the front
// - ctx:           device context that owns the list (must not be NULL)
// - residency_set: residency set to stop tracking (must not be nil)
// returns true if the set was found and removed, false if it was not in the list
static bool ggml_backend_metal_device_remove_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
    assert(ctx != NULL);
    assert(residency_set != nil);

    const int n_sets = ctx->mtl_residency_set_n;

    // locate the first slot that holds this residency set
    int idx = 0;
    while (idx < n_sets && ctx->mtl_residency_set[idx] != residency_set) {
        idx++;
    }

    if (idx == n_sets) {
        return false;
    }

    // compact the list over the removed slot
    for (int k = idx + 1; k < n_sets; ++k) {
        ctx->mtl_residency_set[k - 1] = ctx->mtl_residency_set[k];
    }

    ctx->mtl_residency_set_n = n_sets - 1;

    return true;
}
140+
98141// kernels
99142
100143struct ggml_metal_kernel {
@@ -483,6 +526,11 @@ @implementation GGMLMetalClass
483526 GGML_LOG_INFO (" %s : picking default device: %s \n " , __func__, [[device name ] UTF8String ]);
484527
485528 ctx->queue = [device newCommandQueue ];
529+ if (ctx->queue == nil ) {
530+ GGML_LOG_ERROR (" %s : error: failed to create command queue\n " , __func__);
531+ return NULL ;
532+ }
533+
486534 ctx->d_queue = dispatch_queue_create (" ggml-metal" , DISPATCH_QUEUE_CONCURRENT);
487535
488536 id <MTLLibrary > metal_library;
@@ -1035,6 +1083,8 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
10351083 // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
10361084 int n_buffers;
10371085 struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1086+
1087+ id <MTLResidencySet> residency_set;
10381088};
10391089
10401090// finds the Metal buffer that contains the tensor data on the GPU device
@@ -4039,6 +4089,20 @@ static enum ggml_status ggml_metal_graph_compute(
40394089 struct ggml_backend_metal_context * ctx = backend->context ;
40404090 struct ggml_backend_metal_device_context * ctx_dev = backend->device ->context ;
40414091
4092+ static bool is_first = true ;
4093+ if (is_first) {
4094+ is_first = false ;
4095+ GGML_LOG_INFO (" %s : adding %d residency sets\n " , __func__, ctx_dev->mtl_residency_set_n );
4096+ [ctx->queue addResidencySets: ctx_dev->mtl_residency_set count: ctx_dev->mtl_residency_set_n];
4097+ }
4098+
4099+ // for (int i = 0; i < ctx_dev->mtl_residency_set_n; ++i) {
4100+ // GGML_LOG_INFO("%s: residency set %d allocations size = %zu\n", __func__, i, [ctx_dev->mtl_residency_set[i] allocatedSize]);
4101+ // [ctx_dev->mtl_residency_set[i] requestResidency];
4102+ // }
4103+
4104+ int64_t t_start_us = ggml_time_us ();
4105+
40424106 // number of nodes encoded by the main thread (empirically determined)
40434107 const int n_main = 128 ;
40444108
@@ -4086,19 +4150,23 @@ static enum ggml_status ggml_metal_graph_compute(
40864150 // the main thread commits the first few commands immediately
40874151 // command_buffer[n_cb]
40884152 {
4089- id <MTLCommandBuffer > command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4153+ id <MTLCommandBuffer > command_buffer = [ctx->queue commandBuffer ];
40904154 ctx->command_buffers [n_cb] = command_buffer;
40914155
4156+ [command_buffer useResidencySets: ctx_dev->mtl_residency_set count: ctx_dev->mtl_residency_set_n];
4157+
40924158 [command_buffer enqueue ];
40934159 ctx->encode_async (n_cb);
40944160 }
40954161
40964162 // prepare the rest of the command buffers asynchronously
40974163 // command_buffer[0.. n_cb)
40984164 for (int cb_idx = 0 ; cb_idx < n_cb; ++cb_idx) {
4099- id <MTLCommandBuffer > command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4165+ id <MTLCommandBuffer > command_buffer = [ctx->queue commandBuffer ];
41004166 ctx->command_buffers [cb_idx] = command_buffer;
41014167
4168+ [command_buffer useResidencySets: ctx_dev->mtl_residency_set count: ctx_dev->mtl_residency_set_n];
4169+
41024170 // always enqueue the first two command buffers
41034171 // enqueue all of the command buffers if we don't need to abort
41044172 if (cb_idx < 2 || ctx->abort_callback == NULL ) {
@@ -4163,6 +4231,10 @@ static enum ggml_status ggml_metal_graph_compute(
41634231 }
41644232 }
41654233
4234+ int64_t t_end_us = ggml_time_us ();
4235+
4236+ GGML_LOG_DEBUG (" %s : compute graph took %8.2f ms\n " , __func__, (t_end_us - t_start_us) / 1000.0 );
4237+
41664238 return GGML_STATUS_SUCCESS;
41674239}
41684240
@@ -4176,6 +4248,13 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
41764248 for (int i = 0 ; i < ctx->n_buffers ; i++) {
41774249 [ctx->buffers[i].metal release ];
41784250 }
4251+
4252+ ggml_backend_metal_device_remove_residency_set (buffer->buft ->device ->context , ctx->residency_set );
4253+
4254+ [ctx->residency_set endResidency ];
4255+ [ctx->residency_set removeAllAllocations ];
4256+ [ctx->residency_set release ];
4257+
41794258 ggml_backend_metal_device_rel (buffer->buft ->device ->context );
41804259
41814260 if (ctx->owned ) {
@@ -4284,7 +4363,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
42844363 size_aligned += (size_page - (size_aligned % size_page));
42854364 }
42864365
4287- id <MTLDevice > device = ggml_backend_metal_device_acq (buft->device ->context );
4366+ struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device ->context ;
4367+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
42884368
42894369 ctx->all_data = ggml_metal_host_malloc (size_aligned);
42904370 ctx->all_size = size_aligned;
@@ -4307,10 +4387,34 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
43074387 if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers [0 ].metal == nil )) {
43084388 GGML_LOG_ERROR (" %s : error: failed to allocate buffer, size = %8.2f MiB\n " , __func__, size_aligned / 1024.0 / 1024.0 );
43094389 free (ctx);
4310- ggml_backend_metal_device_rel (buft-> device -> context );
4390+ ggml_backend_metal_device_rel (ctx_dev );
43114391 return NULL ;
43124392 }
43134393
{
    // create a residency set that covers every Metal buffer of this allocation
    MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
    desc.label = @"Primary residency set";
    desc.initialCapacity = ctx->n_buffers;

    // start from nil and judge success by the returned object: the error
    // out-parameter is only meaningful when the call reports a failure
    NSError * error = nil;
    ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];

    // the descriptor is only needed for creation (this file uses manual retain/release)
    [desc release];

    if (ctx->residency_set == nil) {
        GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");

        // release the Metal buffers the same way the buffer free path does
        for (int i = 0; i < ctx->n_buffers; i++) {
            [ctx->buffers[i].metal release];
        }

        // NOTE(review): ctx->all_data was obtained from ggml_metal_host_malloc and is
        //               not released here - confirm the matching free routine and add it

        // mirror the allocation-failure path above: drop the context and the device reference
        free(ctx);
        ggml_backend_metal_device_rel(ctx_dev);
        return NULL;
    }

    for (int i = 0; i < ctx->n_buffers; i++) {
        [ctx->residency_set addAllocation:ctx->buffers[i].metal];
    }

    // apply the pending additions, then ask for the allocations to be made resident
    [ctx->residency_set commit];
    [ctx->residency_set requestResidency];

    // track the residency set in the device context
    // (the helper logs on its own if the tracking list is full)
    ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
}
4417+
43144418 // ggml_backend_metal_log_allocated_size(device, size_aligned);
43154419
43164420 return ggml_backend_buffer_init (buft, ggml_backend_metal_buffer_i, ctx, size);
@@ -4400,7 +4504,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44004504 size_aligned += (size_page - (size_aligned % size_page));
44014505 }
44024506
4403- id <MTLDevice > device = ggml_backend_metal_device_acq (&g_ggml_ctx_dev_main);
4507+ struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4508+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
44044509
44054510 // the buffer fits into the max buffer size allowed by the device
44064511 if (size_aligned <= device.maxBufferLength ) {
@@ -4453,6 +4558,30 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44534558 }
44544559 }
44554560
{
    // create a residency set that covers every Metal buffer wrapping the user memory
    MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
    desc.label = @"Primary residency set";
    desc.initialCapacity = ctx->n_buffers;

    // start from nil and judge success by the returned object: the error
    // out-parameter is only meaningful when the call reports a failure
    NSError * error = nil;
    ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];

    // the descriptor is only needed for creation (this file uses manual retain/release)
    [desc release];

    if (ctx->residency_set == nil) {
        GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");
        // NOTE(review): ctx, its Metal buffers and the device reference acquired through
        //               ggml_backend_metal_device_acq are presumably still held at this
        //               point - confirm and release them before returning
        return NULL;
    }

    for (int i = 0; i < ctx->n_buffers; i++) {
        [ctx->residency_set addAllocation:ctx->buffers[i].metal];
    }

    // apply the pending additions, then ask for the allocations to be made resident
    [ctx->residency_set commit];
    [ctx->residency_set requestResidency];

    // track the residency set in the device context
    // (the helper logs on its own if the tracking list is full)
    ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
}
4584+
44564585 return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
44574586}
44584587
@@ -4766,6 +4895,30 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
47664895 }
47674896 }
47684897
{
    // create a residency set that covers every Metal buffer wrapping the user memory
    MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
    desc.label = @"Primary residency set";
    desc.initialCapacity = ctx->n_buffers;

    // start from nil and judge success by the returned object: the error
    // out-parameter is only meaningful when the call reports a failure
    NSError * error = nil;
    ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];

    // the descriptor is only needed for creation (this file uses manual retain/release)
    [desc release];

    if (ctx->residency_set == nil) {
        GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");
        // NOTE(review): ctx, its Metal buffers and any device reference taken earlier in
        //               this function are presumably still held at this point - confirm
        //               and release them before returning
        return NULL;
    }

    for (int i = 0; i < ctx->n_buffers; i++) {
        [ctx->residency_set addAllocation:ctx->buffers[i].metal];
    }

    // apply the pending additions, then ask for the allocations to be made resident
    [ctx->residency_set commit];
    [ctx->residency_set requestResidency];

    // track the residency set in the device context
    // (the helper logs on its own if the tracking list is full)
    ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
}
4921+
47694922 return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
47704923}
47714924
0 commit comments