1919// max number of MTLCommandBuffer used to submit a graph for processing
2020#define GGML_METAL_MAX_COMMAND_BUFFERS 8
2121
22+ // create residency sets only on macOS >= 15.0
// note: this is a compile-time gate only - it decides whether the residency set code is built at all.
//       the code that actually uses the residency set API additionally checks @available(macOS 15.0, *)
//       at runtime before touching it
23+ #if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
24+ #define GGML_METAL_HAS_RESIDENCY_SETS 1
25+ #endif
26+
2227#define UNUSED (x ) (void )(x)
2328
2429// globals
3944
4045 bool has_simdgroup_reduction;
4146 bool has_simdgroup_mm;
47+ bool has_residency_sets;
4248 bool has_bfloat;
4349 bool use_bfloat;
4450
4854 /* .mtl_device_ref_count =*/ 0 ,
4955 /* .has_simdgroup_reduction =*/ false ,
5056 /* .has_simdgroup_mm =*/ false ,
57+ /* .has_residency_sets =*/ false ,
5158 /* .has_bfloat =*/ false ,
5259 /* .use_bfloat =*/ false ,
5360 /* .name =*/ " " ,
6572
6673 ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily: MTLGPUFamilyApple7];
6774
75+ #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
76+ ctx->has_residency_sets = true ;
77+ #endif
78+
6879 ctx->has_bfloat = [ctx->mtl_device supportsFamily: MTLGPUFamilyMetal3_GGML];
6980 ctx->has_bfloat |= [ctx->mtl_device supportsFamily: MTLGPUFamilyApple6];
7081
@@ -483,6 +494,11 @@ @implementation GGMLMetalClass
483494 GGML_LOG_INFO (" %s : picking default device: %s \n " , __func__, [[device name ] UTF8String ]);
484495
485496 ctx->queue = [device newCommandQueue ];
497+ if (ctx->queue == nil ) {
498+ GGML_LOG_ERROR (" %s : error: failed to create command queue\n " , __func__);
499+ return NULL ;
500+ }
501+
486502 ctx->d_queue = dispatch_queue_create (" ggml-metal" , DISPATCH_QUEUE_CONCURRENT);
487503
488504 id <MTLLibrary > metal_library;
@@ -649,6 +665,7 @@ @implementation GGMLMetalClass
649665
650666 GGML_LOG_INFO (" %s : simdgroup reduction = %s \n " , __func__, ctx_dev->has_simdgroup_reduction ? " true" : " false" );
651667 GGML_LOG_INFO (" %s : simdgroup matrix mul. = %s \n " , __func__, ctx_dev->has_simdgroup_mm ? " true" : " false" );
668+ GGML_LOG_INFO (" %s : has residency sets = %s \n " , __func__, ctx_dev->has_residency_sets ? " true" : " false" );
652669 GGML_LOG_INFO (" %s : has bfloat = %s \n " , __func__, ctx_dev->has_bfloat ? " true" : " false" );
653670 GGML_LOG_INFO (" %s : use bfloat = %s \n " , __func__, ctx_dev->use_bfloat ? " true" : " false" );
654671 GGML_LOG_INFO (" %s : hasUnifiedMemory = %s \n " , __func__, ctx_dev->mtl_device .hasUnifiedMemory ? " true" : " false" );
@@ -1035,8 +1052,60 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
10351052 // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
10361053 int n_buffers;
10371054 struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1055+
1056+ // optional MTLResidencySet
1057+ id rset;
10381058};
10391059
// rset init
//
// creates an optional MTLResidencySet that contains all Metal buffers of the buffer context and requests
// residency for it. the set is stored in ctx->rset and must be destroyed with
// ggml_backend_metal_buffer_rset_free()
//
//   ctx    - buffer context, ctx->buffers[0 .. ctx->n_buffers) must already hold the Metal buffers
//   device - Metal device used to create the residency set
//
// returns false only if the residency set is supported but could not be created. when residency sets are
// not compiled in, or are not available at runtime, ctx->rset is set to nil and true is returned
static bool ggml_backend_metal_buffer_rset_init(struct ggml_backend_metal_buffer_context * ctx, id<MTLDevice> device) {
    // no residency set unless one is successfully created below
    ctx->rset = nil;

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
        desc.label = @"ggml_backend_metal";
        desc.initialCapacity = ctx->n_buffers;

        // must be initialized - the out-parameter is only guaranteed to be written on failure
        NSError * error = nil;
        id<MTLResidencySet> rset = [device newResidencySetWithDescriptor:desc error:&error];

        // the descriptor is needed only for the creation call. this file uses manual reference counting,
        // so it has to be released explicitly on both the success and the failure path
        [desc release];

        // the returned object (not the error pointer) tells whether the call succeeded
        if (rset == nil) {
            GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");
            return false;
        }

        for (int i = 0; i < ctx->n_buffers; i++) {
            [rset addAllocation:ctx->buffers[i].metal];
        }

        // added allocations take effect only after the commit
        [rset commit];
        [rset requestResidency];

        ctx->rset = rset;

        return true;
    }
#else
    GGML_UNUSED(device);
#endif

    return true;
}
1093+
// rset free
//
// ends residency for the residency set stored in ctx->rset (if any), removes all of its allocations and
// releases it. does nothing when the context has no residency set
static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        id residency_set = ctx->rset;

        // nothing to tear down
        if (!residency_set) {
            return;
        }

        [residency_set endResidency];
        [residency_set removeAllAllocations];
        [residency_set release];
    }
#else
    GGML_UNUSED(ctx);
#endif
}
1108+
10401109// finds the Metal buffer that contains the tensor data on the GPU device
10411110// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
10421111// Metal buffer based on the host memory pointer
@@ -4086,7 +4155,7 @@ static enum ggml_status ggml_metal_graph_compute(
40864155 // the main thread commits the first few commands immediately
40874156 // command_buffer[n_cb]
40884157 {
4089- id <MTLCommandBuffer > command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4158+ id <MTLCommandBuffer > command_buffer = [ctx->queue commandBuffer ];
40904159 ctx->command_buffers [n_cb] = command_buffer;
40914160
40924161 [command_buffer enqueue ];
@@ -4096,7 +4165,7 @@ static enum ggml_status ggml_metal_graph_compute(
40964165 // prepare the rest of the command buffers asynchronously
40974166 // command_buffer[0.. n_cb)
40984167 for (int cb_idx = 0 ; cb_idx < n_cb; ++cb_idx) {
4099- id <MTLCommandBuffer > command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4168+ id <MTLCommandBuffer > command_buffer = [ctx->queue commandBuffer ];
41004169 ctx->command_buffers [cb_idx] = command_buffer;
41014170
41024171 // always enqueue the first two command buffers
@@ -4176,6 +4245,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
41764245 for (int i = 0 ; i < ctx->n_buffers ; i++) {
41774246 [ctx->buffers[i].metal release ];
41784247 }
4248+
4249+ ggml_backend_metal_buffer_rset_free (ctx);
41794250 ggml_backend_metal_device_rel (buffer->buft ->device ->context );
41804251
41814252 if (ctx->owned ) {
@@ -4284,7 +4355,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
42844355 size_aligned += (size_page - (size_aligned % size_page));
42854356 }
42864357
4287- id <MTLDevice > device = ggml_backend_metal_device_acq (buft->device ->context );
4358+ struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device ->context ;
4359+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
42884360
42894361 ctx->all_data = ggml_metal_host_malloc (size_aligned);
42904362 ctx->all_size = size_aligned;
@@ -4307,7 +4379,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
43074379 if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers [0 ].metal == nil )) {
43084380 GGML_LOG_ERROR (" %s : error: failed to allocate buffer, size = %8.2f MiB\n " , __func__, size_aligned / 1024.0 / 1024.0 );
43094381 free (ctx);
4310- ggml_backend_metal_device_rel (buft->device ->context );
4382+ ggml_backend_metal_device_rel (ctx_dev);
4383+ return NULL ;
4384+ }
4385+
4386+ if (!ggml_backend_metal_buffer_rset_init (ctx, device)) {
4387+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4388+ free (ctx);
4389+ ggml_backend_metal_device_rel (ctx_dev);
43114390 return NULL ;
43124391 }
43134392
@@ -4400,7 +4479,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44004479 size_aligned += (size_page - (size_aligned % size_page));
44014480 }
44024481
4403- id <MTLDevice > device = ggml_backend_metal_device_acq (&g_ggml_ctx_dev_main);
4482+ struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4483+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
44044484
44054485 // the buffer fits into the max buffer size allowed by the device
44064486 if (size_aligned <= device.maxBufferLength ) {
@@ -4453,6 +4533,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44534533 }
44544534 }
44554535
4536+ if (!ggml_backend_metal_buffer_rset_init (ctx, device)) {
4537+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4538+ free (ctx);
4539+ ggml_backend_metal_device_rel (ctx_dev);
4540+ return NULL ;
4541+ }
4542+
44564543 return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
44574544}
44584545
@@ -4766,6 +4853,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
47664853 }
47674854 }
47684855
4856+ if (!ggml_backend_metal_buffer_rset_init (ctx, device)) {
4857+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4858+ free (ctx);
4859+ ggml_backend_metal_device_rel (ctx_dev);
4860+ return NULL ;
4861+ }
4862+
47694863 return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
47704864}
47714865
0 commit comments