-
Notifications
You must be signed in to change notification settings - Fork 13.4k
CLBlast support #1164
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CLBlast support #1164
Changes from 13 commits
a908c37
b7143c1
6f66870
1b16b8c
309af7f
f469d9a
8603c25
18cc05b
ae73887
daa5df5
36bfb3c
1370710
2b0c6a5
b746458
ce97a80
4a35ec9
fafebff
96346fb
bbfba5f
4530d5c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,221 @@ | ||
| #include "ggml-opencl.h" | ||
|
|
||
| #define CL_TARGET_OPENCL_VERSION 110 | ||
| #include <clblast_c.h> | ||
|
|
||
| #include <stdio.h> | ||
| #include <string.h> | ||
|
|
||
| #include "ggml.h" | ||
|
|
||
| #include <ggml_clblast_dequant.cl> | ||
|
|
||
| #define CL_CHECK(err, name) \ | ||
| do { \ | ||
| cl_int err_ = (err); \ | ||
| if (err_ != CL_SUCCESS) { \ | ||
| fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \ | ||
| exit(1); \ | ||
| } \ | ||
| } while (0) | ||
|
|
||
| cl_platform_id platform; | ||
| cl_device_id device; | ||
| cl_context context; | ||
| cl_command_queue queue; | ||
| cl_program program; | ||
| cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3; | ||
| cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; | ||
| size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; | ||
0cc4m marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { | ||
| cl_program program; | ||
| char *program_log; | ||
| size_t program_size, log_size; | ||
| int err; | ||
|
|
||
| program_size = strlen(program_buffer); | ||
|
|
||
| program = clCreateProgramWithSource(ctx, 1, | ||
| (const char**)&program_buffer, &program_size, &err); | ||
| if(err < 0) { | ||
| fprintf(stderr, "OpenCL error creating program"); | ||
| exit(1); | ||
| } | ||
|
|
||
| err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); | ||
| if(err < 0) { | ||
|
|
||
| clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, | ||
| 0, NULL, &log_size); | ||
| program_log = (char*) malloc(log_size + 1); | ||
| program_log[log_size] = '\0'; | ||
| clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, | ||
| log_size + 1, program_log, NULL); | ||
| printf("%s\n", program_log); | ||
| free(program_log); | ||
| exit(1); | ||
| } | ||
|
|
||
| return program; | ||
| } | ||
|
|
||
| void ggml_cl_init(void) { | ||
| cl_int err = 0; | ||
| char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM"); | ||
| char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE"); | ||
| int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM)); | ||
| int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE)); | ||
| printf("\nInitializing CLBlast (First Run)..."); | ||
| printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num); | ||
| cl_uint num_platforms; | ||
| clGetPlatformIDs(0, NULL, &num_platforms); | ||
| cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); | ||
| clGetPlatformIDs(num_platforms, platforms, NULL); | ||
| platform = platforms[plat_num]; | ||
| char platform_buffer[1024]; | ||
| clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL); | ||
| cl_uint num_devices; | ||
| clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); | ||
| cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); | ||
| clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); | ||
| device = devices[dev_num]; | ||
| char device_buffer[1024]; | ||
| clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL); | ||
| printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It takes 9 seconds to go from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It does not take that long for me, but this depends on your OpenCL (graphics) driver. |
||
| context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); | ||
| CL_CHECK(err, "clCreateContext"); | ||
| queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); | ||
| CL_CHECK(err, "clCreateCommandQueue"); | ||
|
|
||
| free(platforms); | ||
| free(devices); | ||
|
|
||
| program = build_program_from_source(context, device, clblast_dequant); | ||
|
|
||
| // Prepare dequantize kernels | ||
| kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err); | ||
| CL_CHECK(err, "clCreateKernel"); | ||
| kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err); | ||
| CL_CHECK(err, "clCreateKernel"); | ||
| kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err); | ||
| CL_CHECK(err, "clCreateKernel"); | ||
| kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err); | ||
| CL_CHECK(err, "clCreateKernel"); | ||
| } | ||
|
|
||
| static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { | ||
| if (req_size <= *cur_size) { | ||
| return; | ||
| } | ||
|
|
||
| // Reallocate buffer with enough space | ||
| if (*cur_size > 0) { | ||
| clReleaseMemObject(*buf); | ||
| } | ||
| cl_int err; | ||
| *buf = clCreateBuffer(context, flags, req_size, NULL, &err); | ||
| *cur_size = req_size; | ||
| CL_CHECK(err, "clCreateBuffer"); | ||
| } | ||
|
|
||
| void ggml_cl_sgemm_wrapper( | ||
| const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, | ||
| const int m, const int n, const int k, | ||
| const float alpha, const void *host_a, const int lda, | ||
| const float *host_b, const int ldb, const float beta, | ||
| float *host_c, const int ldc, const int btype) { | ||
| cl_int err = 0; | ||
|
|
||
| cl_kernel kernel; | ||
| size_t global = n * k, local, size_qb; | ||
| bool dequant; | ||
|
|
||
| switch (btype) { | ||
| case GGML_TYPE_F32: | ||
| dequant = false; | ||
| break; | ||
| case GGML_TYPE_Q4_0: | ||
| dequant = true; | ||
| kernel = kernel_q4_0; | ||
| local = 16; | ||
| size_qb = global * (sizeof(float) + local) / 32; | ||
| break; | ||
| case GGML_TYPE_Q4_1: | ||
| dequant = true; | ||
| kernel = kernel_q4_1; | ||
| local = 16; | ||
| size_qb = global * (sizeof(float) * 2 + local) / 32; | ||
| break; | ||
| case GGML_TYPE_Q4_2: | ||
| dequant = true; | ||
| kernel = kernel_q4_2; | ||
| local = 8; | ||
| size_qb = global * (sizeof(short) + local) / 16; | ||
| break; | ||
| case GGML_TYPE_Q4_3: | ||
| dequant = true; | ||
| kernel = kernel_q4_3; | ||
| local = 8; | ||
| size_qb = global * (sizeof(short) * 2 + local) / 16; | ||
| break; | ||
| default: | ||
| fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype); | ||
| abort(); | ||
| } | ||
|
|
||
| const size_t size_a = m * k * sizeof(float); | ||
| const size_t size_b = n * k * sizeof(float); | ||
| const size_t size_c = m * n * sizeof(float); | ||
|
|
||
| // Prepare buffers | ||
| ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a); | ||
| if (dequant) { | ||
| ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb); | ||
| } | ||
| ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b); | ||
| ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c); | ||
|
|
||
| cl_event ev_a, ev_qb, ev_b; | ||
|
|
||
| if (dequant) { | ||
| err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb); | ||
| err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b); | ||
| CL_CHECK(err, "clSetKernelArg"); | ||
| clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); | ||
| } else { | ||
| clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); | ||
| } | ||
|
|
||
| clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); | ||
| if (dequant) { | ||
| err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b); | ||
| CL_CHECK(err, "clEnqueueNDRangeKernel"); | ||
| } | ||
| clWaitForEvents(1, &ev_a); | ||
| clWaitForEvents(1, &ev_b); | ||
| clReleaseEvent(ev_a); | ||
| clReleaseEvent(ev_b); | ||
| if (dequant) { | ||
| clReleaseEvent(ev_qb); | ||
|
||
| } | ||
|
|
||
| cl_event ev_sgemm; | ||
| CLBlastSgemm((CLBlastLayout)order, | ||
| (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, | ||
| m, n, k, | ||
| alpha, | ||
| cl_buffer_a, 0, lda, | ||
| cl_buffer_b, 0, ldb, | ||
| beta, | ||
| cl_buffer_c, 0, ldc, | ||
| &queue, &ev_sgemm); | ||
|
|
||
| cl_event ev_c; | ||
| clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c); | ||
|
|
||
| // Wait for completion | ||
| clWaitForEvents(1, &ev_c); | ||
| clReleaseEvent(ev_sgemm); | ||
| clReleaseEvent(ev_c); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| #pragma once | ||
|
|
||
| #ifdef __cplusplus | ||
| extern "C" { | ||
| #endif | ||
|
|
||
| void ggml_cl_init(void); | ||
|
|
||
| enum ggml_blas_order { | ||
| GGML_BLAS_ORDER_ROW_MAJOR = 101, | ||
| GGML_BLAS_ORDER_COLUMN_MAJOR = 102, | ||
| }; | ||
|
|
||
| enum ggml_blas_op { | ||
| GGML_BLAS_OP_N = 111, | ||
| GGML_BLAS_OP_T = 112, | ||
| GGML_BLAS_OP_C = 113, | ||
| }; | ||
|
|
||
| void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); | ||
|
|
||
| #ifdef __cplusplus | ||
| } | ||
| #endif |
Uh oh!
There was an error while loading. Please reload this page.