Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions ggml/src/ggml-metal/ggml-metal-context.m
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
// additional, inference-time compiled pipelines
ggml_metal_pipelines_t pipelines_ext;

bool use_bfloat;
bool use_fusion;
bool use_concurrency;
bool use_graph_optimize;
Expand Down Expand Up @@ -121,11 +120,10 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
}
}

const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
//const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);

res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

res->use_bfloat = props_dev->has_bfloat;
res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;

Expand All @@ -147,7 +145,6 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {

memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt));

GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, res->use_bfloat ? "true" : "false");
GGML_LOG_INFO("%s: use fusion = %s\n", __func__, res->use_fusion ? "true" : "false");
GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
Expand Down
5 changes: 4 additions & 1 deletion ggml/src/ggml-metal/ggml-metal-device.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,9 @@ void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder);

typedef struct ggml_metal_library * ggml_metal_library_t;

ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev);
ggml_metal_library_t ggml_metal_library_init (ggml_metal_device_t dev);
ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose);

void ggml_metal_library_free(ggml_metal_library_t lib);

ggml_metal_pipeline_t ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name);
Expand Down Expand Up @@ -193,6 +195,7 @@ struct ggml_metal_device_props {
bool has_simdgroup_mm;
bool has_unified_memory;
bool has_bfloat;
bool has_tensor;
bool use_residency_sets;
bool use_shared_buffers;

Expand Down
124 changes: 122 additions & 2 deletions ggml/src/ggml-metal/ggml-metal-device.m
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@
#define GGML_METAL_HAS_RESIDENCY_SETS 1
#endif

// overload of MTLGPUFamilyMetal3 (not available in some environments)
// overload of MTLGPUFamilyMetalX (not available in some environments)
static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;

// virtual address for GPU memory allocations
static atomic_uintptr_t g_addr_device = 0x000000400ULL;
Expand Down Expand Up @@ -261,6 +262,10 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
}

if (ggml_metal_device_get_props(dev)->has_tensor) {
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
}

#if GGML_METAL_EMBED_LIBRARY
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
#endif
Expand Down Expand Up @@ -298,6 +303,72 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
return res;
}

ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) {
if (source == NULL) {
GGML_LOG_ERROR("%s: source is NULL\n", __func__);
return NULL;
}

id<MTLDevice> device = ggml_metal_device_get_obj(dev);
id<MTLLibrary> library = nil;
NSError * error = nil;

const int64_t t_start = ggml_time_us();

NSString * src = [[NSString alloc] initWithBytes:source
length:strlen(source)
encoding:NSUTF8StringEncoding];
if (!src) {
GGML_LOG_ERROR("%s: failed to create NSString from source\n", __func__);
return NULL;
}

@autoreleasepool {
NSMutableDictionary * prep = [NSMutableDictionary dictionary];

MTLCompileOptions * options = [MTLCompileOptions new];
options.preprocessorMacros = prep;

library = [device newLibraryWithSource:src options:options error:&error];
if (error) {
if (verbose) {
GGML_LOG_ERROR("%s: error compiling source: %s\n", __func__, [[error description] UTF8String]);
} else {
GGML_LOG_ERROR("%s: error compiling source\n", __func__);
}
library = nil;
}

[options release];
}

[src release];

if (!library) {
if (verbose) {
GGML_LOG_ERROR("%s: failed to create Metal library from source\n", __func__);
}

return NULL;
}

if (verbose) {
GGML_LOG_INFO("%s: compiled in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
}

ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
if (!res) {
GGML_LOG_ERROR("%s: calloc failed\n", __func__);
return NULL;
}

res->obj = library;
res->device = device;
res->pipelines = ggml_metal_pipelines_init();

return res;
}

void ggml_metal_library_free(ggml_metal_library_t lib) {
if (!lib) {
return;
Expand Down Expand Up @@ -469,14 +540,62 @@ ggml_metal_device_t ggml_metal_device_init(void) {

dev->props.has_bfloat = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
if (getenv("GGML_METAL_BF16_DISABLE") != NULL) {
dev->props.has_bfloat = false;
}

dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML];
if (getenv("GGML_METAL_TENSOR_DISABLE") != NULL) {
dev->props.has_tensor = false;
}

// try to compile a dummy tensor kernel to determine if the tensor API is supported for bfloat
if (dev->props.has_tensor && dev->props.has_bfloat) {
const char * src_tensor_bf16 = "\n"
"#include <metal_stdlib> \n"
"#include <metal_tensor> \n"
"#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
" \n"
"using namespace metal; \n"
"using namespace mpp::tensor_ops; \n"
" \n"
"kernel void bfloat_dummy_kernel( \n"
" tensor<device bfloat, dextents<int32_t, 2>> A [[buffer(0)]], \n"
" tensor<device bfloat, dextents<int32_t, 2>> B [[buffer(1)]], \n"
" uint2 tgid [[threadgroup_position_in_grid]]) \n"
"{ \n"
" // Create slices for this threadgroup (no real computation performed). \n"
" auto tA = A.slice(0, (int)tgid.y); \n"
" auto tB = B.slice((int)tgid.x, 0); \n"
" \n"
" // Minimal matmul descriptor: 8×8 tile with dynamic K dimension. \n"
" matmul2d< \n"
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
" execution_thread> mm; \n"
" \n"
" // Obtain a cooperative destination tensor of bfloat type. \n"
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), bfloat>(); \n"
" \n"
" // Silence “unused variable” warnings. \n"
" (void)cT; \n"
"}";

GGML_LOG_INFO("%s: testing tensor API for bfloat support\n", __func__);
ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_bf16, false);
if (lib == NULL) {
GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
dev->props.has_bfloat = false;
} else {
ggml_metal_library_free(lib);
}
}

dev->props.use_residency_sets = true;
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
dev->props.use_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == nil;
#endif

dev->props.use_shared_buffers = dev->props.has_unified_memory;

if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
dev->props.use_shared_buffers = false;
}
Expand Down Expand Up @@ -529,6 +648,7 @@ ggml_metal_device_t ggml_metal_device_init(void) {
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false");
GGML_LOG_INFO("%s: has unified memory = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false");
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, dev->props.has_bfloat ? "true" : "false");
GGML_LOG_INFO("%s: has tensor = %s\n", __func__, dev->props.has_tensor ? "true" : "false");
GGML_LOG_INFO("%s: use residency sets = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");

Expand Down
Loading