Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions ggml/src/ggml-metal/ggml-metal-context.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,22 @@ typedef struct ggml_metal * ggml_metal_t;
ggml_metal_t ggml_metal_init(ggml_metal_device_t dev);
void ggml_metal_free(ggml_metal_t ctx);

const char * ggml_metal_get_name(ggml_metal_t ctx);

void ggml_metal_synchronize(ggml_metal_t ctx);

void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

enum ggml_status ggml_metal_graph_compute (ggml_metal_t ctx, struct ggml_cgraph * gf);
void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf);

void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev);
void ggml_metal_event_wait (ggml_metal_t ctx, ggml_metal_event_t ev);

ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx);

void ggml_metal_set_n_cb (ggml_metal_t ctx, int n_cb);
void ggml_metal_set_abort_callback (ggml_metal_t ctx, ggml_abort_callback abort_callback, void * user_data);
bool ggml_metal_supports_family (ggml_metal_t ctx, int family);
Expand Down
105 changes: 99 additions & 6 deletions ggml/src/ggml-metal/ggml-metal-context.m
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,13 @@
};

struct ggml_metal {
char name[128];

ggml_metal_device_t dev;
ggml_metal_library_t lib;

ggml_metal_event_t ev_cpy; // for async copies

dispatch_queue_t d_queue;

// additional, inference-time compiled pipelines
Expand Down Expand Up @@ -117,7 +121,11 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
}
}

//const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
res->ev_cpy = ggml_metal_device_event_init(dev);

const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);

snprintf(res->name, sizeof(res->name), "%s", props_dev->name);

res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

Expand Down Expand Up @@ -206,9 +214,15 @@ void ggml_metal_free(ggml_metal_t ctx) {

dispatch_release(ctx->d_queue);

ggml_metal_device_event_free(ctx->dev, ctx->ev_cpy);

free(ctx);
}

const char * ggml_metal_get_name(ggml_metal_t ctx) {
return ctx->name;
}

void ggml_metal_synchronize(ggml_metal_t ctx) {
// wait for any backend operations to finish
if (ctx->cmd_buf_last) {
Expand Down Expand Up @@ -273,8 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
// wrap the source data into a Metal buffer
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
id<MTLBuffer> buf_src = [device newBufferWithBytes:data
length:size
options:MTLResourceStorageModeShared];
length:size
options:MTLResourceStorageModeShared];

GGML_ASSERT(buf_src);

Expand Down Expand Up @@ -316,9 +330,9 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
@autoreleasepool {
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
length:size
options:MTLResourceStorageModeShared
deallocator:nil];

GGML_ASSERT(buf_dst);

Expand Down Expand Up @@ -356,6 +370,49 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
}
}

bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@autoreleasepool {
struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(src);
struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(dst);

if (bid_src.metal == nil || bid_dst.metal == nil) {
return false;
}

// queue the copy operation into the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx_src->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

[encoder copyFromBuffer:bid_src.metal
sourceOffset:bid_src.offs
toBuffer:bid_dst.metal
destinationOffset:bid_dst.offs
size:ggml_nbytes(src)];

[encoder endEncoding];

ggml_metal_event_t ev_cpy = ggml_metal_get_ev_cpy(ctx_src);
ggml_metal_event_record(ctx_src, ev_cpy);

[cmd_buf commit];

// do not wait here for completion
//[cmd_buf waitUntilCompleted];

// instead, remember a reference to the command buffer and wait for it later if needed
[ctx_src->cmd_bufs_ext addObject:cmd_buf];
ctx_src->cmd_buf_last = cmd_buf;

[cmd_buf retain];

ggml_metal_event_wait(ctx_dst, ev_cpy);

return true;
}
}

enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
// number of nodes encoded by the main thread (empirically determined)
const int n_main = 64;
Expand Down Expand Up @@ -530,6 +587,42 @@ void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf) {
//printf("%s: graph optimize took %.3f ms\n", __func__, (ggml_time_us() - t_start) / 1000.0);
}

void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev) {
@autoreleasepool {
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];

ggml_metal_event_encode_signal(ev, cmd_buf);

[cmd_buf commit];

[ctx->cmd_bufs_ext addObject:cmd_buf];
ctx->cmd_buf_last = cmd_buf;

[cmd_buf retain];
}
}

void ggml_metal_event_wait(ggml_metal_t ctx, ggml_metal_event_t ev) {
@autoreleasepool {
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];

ggml_metal_event_encode_wait(ev, cmd_buf);

[cmd_buf commit];

[ctx->cmd_bufs_ext addObject:cmd_buf];
ctx->cmd_buf_last = cmd_buf;

[cmd_buf retain];
}
}

ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx) {
return ctx->ev_cpy;
}

void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
if (ctx->n_cb != n_cb) {
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
Expand Down
8 changes: 5 additions & 3 deletions ggml/src/ggml-metal/ggml-metal-device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@ struct ggml_metal_device_deleter {

typedef std::unique_ptr<ggml_metal_device, ggml_metal_device_deleter> ggml_metal_device_ptr;

ggml_metal_device_t ggml_metal_device_get(void) {
static ggml_metal_device_ptr ctx { ggml_metal_device_init() };
ggml_metal_device_t ggml_metal_device_get(int device) {
static std::vector<ggml_metal_device_ptr> devs;

return ctx.get();
devs.emplace_back(ggml_metal_device_init(device));

return devs.back().get();
}

struct ggml_metal_pipelines {
Expand Down
16 changes: 13 additions & 3 deletions ggml/src/ggml-metal/ggml-metal-device.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,9 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
//

struct ggml_metal_device_props {
int device;
char name[128];
char desc[128];

size_t max_buffer_size;
size_t max_working_set_size;
Expand All @@ -224,11 +226,15 @@ struct ggml_metal_device_props {
int op_offload_min_batch_size;
};

ggml_metal_device_t ggml_metal_device_init(void);
typedef struct ggml_metal_event * ggml_metal_event_t;

void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf);
void ggml_metal_event_encode_wait (ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf);

ggml_metal_device_t ggml_metal_device_init(int device);
void ggml_metal_device_free(ggml_metal_device_t dev);

// return a singleton that is automatically destroyed when the program exits
ggml_metal_device_t ggml_metal_device_get(void);
ggml_metal_device_t ggml_metal_device_get(int device);

void * ggml_metal_device_get_obj (ggml_metal_device_t dev); // id<MTLDevice>
void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id<MTLCommandQueue>
Expand All @@ -240,6 +246,10 @@ void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset

void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev);

ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev);
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev);
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev);

void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);

Expand Down
71 changes: 64 additions & 7 deletions ggml/src/ggml-metal/ggml-metal-device.m
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@
static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;

// virtual address for GPU memory allocations
static atomic_uintptr_t g_addr_device = 0x000000400ULL;

#if !GGML_METAL_EMBED_LIBRARY
// Here to assist with NSBundle Path Hack
@interface GGMLMetalClass : NSObject
Expand Down Expand Up @@ -523,6 +520,9 @@ void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder) {
ggml_metal_library_t library;

struct ggml_metal_device_props props;

// virtual address for GPU memory allocations
atomic_uintptr_t addr_virt;
};

//
Expand Down Expand Up @@ -618,7 +618,7 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
free(rsets);
}

ggml_metal_device_t ggml_metal_device_init(void) {
ggml_metal_device_t ggml_metal_device_init(int device) {
ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));

assert(dev != NULL);
Expand All @@ -632,6 +632,9 @@ ggml_metal_device_t ggml_metal_device_init(void) {
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
}

dev->addr_virt = 0x000000400ULL;

dev->props.device = device;
dev->props.has_simdgroup_reduction = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
dev->props.has_simdgroup_reduction |= [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];

Expand Down Expand Up @@ -792,7 +795,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
dev->props.max_working_set_size = dev->mtl_device.maxBufferLength;
}

strncpy(dev->props.name, [[dev->mtl_device name] UTF8String], sizeof(dev->props.name) - 1);
snprintf(dev->props.name, sizeof(dev->props.name), "%s%d", "MTL", device);
snprintf(dev->props.desc, sizeof(dev->props.desc), "%s", [[dev->mtl_device name] UTF8String]);

dev->library = ggml_metal_library_init(dev);
if (!dev->library) {
Expand Down Expand Up @@ -922,6 +926,59 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
}

struct ggml_metal_event {
void * obj; // id<MTLEvent>

atomic_int value;
};

void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
id<MTLEvent> event = (id<MTLEvent>)ev->obj;

id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;

[cmd_buf encodeSignalEvent:event value:atomic_fetch_add_explicit(&ev->value, 1, memory_order_relaxed) + 1];
}

void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
id<MTLEvent> event = (id<MTLEvent>)ev->obj;

id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;

[cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
}

ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
id<MTLEvent> event = [dev->mtl_device newEvent];

ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));

ev->obj = (__bridge void *)event;
ev->value = 0;

return ev;
}

void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
id<MTLEvent> event = ev->obj;
[event release];

free(ev);

GGML_UNUSED(dev);
}

void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
@autoreleasepool {
id<MTLEvent> event = ev->obj;

id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
[cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
[cmd_buf commit];
[cmd_buf waitUntilCompleted];
}
}

void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
if (@available(macOS 10.12, iOS 16.0, *)) {
*total = dev->mtl_device.recommendedMaxWorkingSetSize;
Expand Down Expand Up @@ -1344,8 +1401,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
res->all_data = ggml_metal_host_malloc(size_aligned);
res->is_shared = true;
} else {
// use virtual address from g_addr_device counter
res->all_data = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
// use virtual address
res->all_data = (void *) atomic_fetch_add_explicit(&dev->addr_virt, size_aligned, memory_order_relaxed);
res->is_shared = false;
}
res->all_size = size_aligned;
Expand Down
Loading