-
Couldn't load subscription status.
- Fork 13.4k
CLBlast support #1164
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
CLBlast support #1164
Changes from 10 commits
Commits
Show all changes
20 commits
Select commit
Hold shift + click to select a range
a908c37
Allow use of OpenCL GPU-based BLAS using ClBlast instead of OpenBLAS …
0cc4m b7143c1
Improve ClBlast implementation, avoid recreating buffers, remove redu…
0cc4m 6f66870
Finish merge of ClBlast support
0cc4m 1b16b8c
Move CLBlast implementation to separate file
0cc4m 309af7f
Add q4_2 and q4_3 CLBlast support, improve code
0cc4m f469d9a
Double CLBlast speed by disabling OpenBLAS thread workaround
0cc4m 8603c25
Fix device selection env variable names
0cc4m 18cc05b
Fix cast in opencl kernels
0cc4m ae73887
Add CLBlast to CMakeLists.txt
0cc4m daa5df5
Replace buffer pool with static buffers a, b, qb, c
0cc4m 36bfb3c
Fix typos, use GGML_TYPE defines, improve code
0cc4m 1370710
Improve btype dequant kernel selection code, add error if type is uns…
0cc4m 2b0c6a5
Improve code quality
0cc4m b746458
Use c compiler for opencl files
0cc4m ce97a80
Simplify code, fix include
0cc4m 4a35ec9
First check error, then release event
0cc4m fafebff
Make globals static, fix indentation
0cc4m 96346fb
Rename dequant kernels file to conform with other file names
0cc4m bbfba5f
Fix import cl file name
0cc4m 4530d5c
Merge branch 'master' into clblast-llama-cpp
ggerganov File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,200 @@ | ||
| #include "ggml-opencl.h" | ||
|
|
||
| #include <atomic> | ||
| #include <cstdio> | ||
| #include <cstring> | ||
|
|
||
| #include "ggml.h" | ||
|
|
||
| #include <ggml_clblast_dequant.cl> | ||
|
|
||
| cl_platform_id platform; | ||
| cl_device_id device; | ||
| cl_context context; | ||
| cl_command_queue queue; | ||
| cl_program program; | ||
| cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3; | ||
| cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; | ||
| size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; | ||
|
|
||
| cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { | ||
| cl_program program; | ||
| char *program_log; | ||
| size_t program_size, log_size; | ||
| int err; | ||
|
|
||
| program_size = strlen(program_buffer); | ||
|
|
||
| program = clCreateProgramWithSource(ctx, 1, | ||
| (const char**)&program_buffer, &program_size, &err); | ||
| if(err < 0) { | ||
| fprintf(stderr, "OpenCL error creating program"); | ||
| exit(1); | ||
| } | ||
|
|
||
| err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); | ||
| if(err < 0) { | ||
|
|
||
| clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, | ||
| 0, NULL, &log_size); | ||
| program_log = (char*) malloc(log_size + 1); | ||
| program_log[log_size] = '\0'; | ||
| clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, | ||
| log_size + 1, program_log, NULL); | ||
| printf("%s\n", program_log); | ||
| free(program_log); | ||
| exit(1); | ||
| } | ||
|
|
||
| return program; | ||
| } | ||
|
|
||
| void ggml_cl_init(void) { | ||
| cl_int err = 0; | ||
| char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM"); | ||
| char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE"); | ||
| int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM)); | ||
| int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE)); | ||
| printf("\nInitializing CLBlast (First Run)..."); | ||
| printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num); | ||
| cl_uint num_platforms; | ||
| clGetPlatformIDs(0, NULL, &num_platforms); | ||
| cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); | ||
| clGetPlatformIDs(num_platforms, platforms, NULL); | ||
| platform = platforms[plat_num]; | ||
| char platform_buffer[1024]; | ||
| clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL); | ||
| cl_uint num_devices; | ||
| clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); | ||
| cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); | ||
| clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); | ||
| device = devices[dev_num]; | ||
| char device_buffer[1024]; | ||
| clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL); | ||
| printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer); | ||
| context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); | ||
| CL_CHECK(err, "clCreateContext"); | ||
| queue = clCreateCommandQueue(context, device, 0, &err); | ||
| CL_CHECK(err, "clCreateCommandQueue"); | ||
|
|
||
| free(platforms); | ||
| free(devices); | ||
|
|
||
| program = build_program_from_source(context, device, clblast_dequant); | ||
|
|
||
| // Prepare dequantize kernels | ||
| kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err); | ||
| CL_CHECK(err, "clCreateKernel"); | ||
| kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err); | ||
| CL_CHECK(err, "clCreateKernel"); | ||
| kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err); | ||
| CL_CHECK(err, "clCreateKernel"); | ||
| kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err); | ||
| CL_CHECK(err, "clCreateKernel"); | ||
| } | ||
|
|
||
| void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { | ||
| if (req_size <= *cur_size) { | ||
| return; | ||
| } | ||
|
|
||
| // Reallocate buffer with enough space | ||
| if (*cur_size > 0) { | ||
| clReleaseMemObject(*buf); | ||
| } | ||
| cl_int err; | ||
| *buf = clCreateBuffer(context, flags, req_size, NULL, &err); | ||
| *cur_size = req_size; | ||
| CL_CHECK(err, "clCreateBuffer"); | ||
| } | ||
|
|
||
| void ggml_cl_sgemm_wrapper(const CLBlastLayout order, const CLBlastTranspose trans_a, const CLBlastTranspose trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype) { | ||
| cl_int err = 0; | ||
|
|
||
| cl_event events[4]; | ||
| events[0] = NULL; | ||
| events[1] = NULL; | ||
| events[2] = NULL; | ||
| events[3] = NULL; | ||
0cc4m marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| cl_kernel kernel; | ||
| size_t global, local, size_qb; | ||
| const bool dequant = btype >= 2 && btype < 6; | ||
| if (dequant) { | ||
| global = n * k; | ||
|
|
||
| switch (btype) { | ||
| case 2: | ||
| kernel = kernel_q4_0; | ||
| local = 16; | ||
| size_qb = global * (sizeof(float) + local) / 32; | ||
| break; | ||
| case 3: | ||
| kernel = kernel_q4_1; | ||
| local = 16; | ||
| size_qb = global * (sizeof(float) * 2 + local) / 32; | ||
| break; | ||
| case 4: | ||
| kernel = kernel_q4_2; | ||
| local = 8; | ||
| size_qb = global * (sizeof(short) + local) / 16; | ||
| break; | ||
| case 5: | ||
| kernel = kernel_q4_3; | ||
| local = 8; | ||
| size_qb = global * (sizeof(short) * 2 + local) / 16; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| const size_t size_a = m * k * sizeof(float); | ||
| const size_t size_b = n * k * sizeof(float); | ||
| const size_t size_c = m * n * sizeof(float); | ||
|
|
||
| // Prepare buffers | ||
| ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a); | ||
| if (dequant) { | ||
| ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb); | ||
| } | ||
| ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b); | ||
| ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c); | ||
|
|
||
| if (dequant) { | ||
| err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb); | ||
| err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b); | ||
| CL_CHECK(err, "clSetKernelArg"); | ||
| clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, events + 1); | ||
| } else { | ||
| clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, events + 1); | ||
| } | ||
|
|
||
| clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, events); | ||
| if (dequant) { | ||
| err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, events + 1, events + 3); | ||
| CL_CHECK(err, "clEnqueueNDRangeKernel"); | ||
| } | ||
| clWaitForEvents(dequant ? 4 : 3, events); | ||
| clReleaseEvent(events[0]); | ||
| clReleaseEvent(events[1]); | ||
| clReleaseEvent(events[2]); | ||
| if (dequant) { | ||
| clReleaseEvent(events[3]); | ||
| } | ||
|
|
||
| CLBlastSgemm(order, | ||
| trans_a, trans_b, | ||
| m, n, k, | ||
| alpha, | ||
| cl_buffer_a, 0, lda, | ||
| cl_buffer_b, 0, ldb, | ||
| beta, | ||
| cl_buffer_c, 0, ldc, | ||
| &queue, events); | ||
|
|
||
| clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, events, events + 1); | ||
|
|
||
| // Wait for completion | ||
| clWaitForEvents(2, events); | ||
| clReleaseEvent(events[0]); | ||
| clReleaseEvent(events[1]); | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| #pragma once | ||
|
|
||
| #define CL_TARGET_OPENCL_VERSION 110 | ||
| #include <clblast_c.h> | ||
| #define MAX_CL_BUFFERS 16 | ||
|
|
||
| #ifdef __cplusplus | ||
| extern "C" { | ||
| #endif | ||
|
|
||
| // Buffer reuse code adapted from cuda implementation by slaren | ||
| #define CL_CHECK(err, name) \ | ||
| do { \ | ||
| cl_int err_ = (err); \ | ||
| if (err_ != CL_SUCCESS) { \ | ||
| fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \ | ||
| exit(1); \ | ||
| } \ | ||
| } while (0) | ||
|
|
||
| cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size); | ||
| void ggml_cl_pool_free(cl_mem mem, size_t size); | ||
|
|
||
| cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer); | ||
| void ggml_cl_init(void); | ||
|
|
||
| void ggml_cl_sgemm_wrapper(const CLBlastLayout order, const CLBlastTranspose trans_a, const CLBlastTranspose trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); | ||
|
|
||
| #ifdef __cplusplus | ||
| } | ||
| #endif |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.