Merged
Commits (24)
8e6735e llama : initial ggml-backend integration (slaren, Dec 17, 2023)
0808aa5 add ggml-metal (slaren, Dec 19, 2023)
9450791 Merge remote-tracking branch 'origin/master' into sl/ggml-backend-int (slaren, Dec 19, 2023)
0c5ee7c cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_… (slaren, Dec 19, 2023)
1ac01fb add ggml_backend_buffer_clear (slaren, Dec 19, 2023)
c8bd5d8 add ggml_backend_buffer_is_hos, used to avoid copies if possible when… (slaren, Dec 19, 2023)
72a0c96 disable gpu backends with ngl 0 (slaren, Dec 20, 2023)
d3e7242 more accurate mlock (slaren, Dec 20, 2023)
c3678ca unmap offloaded part of the model (slaren, Dec 20, 2023)
5241045 use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance… (slaren, Dec 20, 2023)
bcd87ca update quantize and lora (slaren, Dec 20, 2023)
24cc321 update session copy/set to use ggml-backend (slaren, Dec 20, 2023)
f70f94d use posix_fadvise instead of posix_fadvise64 (slaren, Dec 20, 2023)
6c045a8 ggml_backend_alloc_ctx_tensors_from_buft : remove old print (slaren, Dec 20, 2023)
5834a25 llama_mmap::align_offset : use pointers instead of references for out… (slaren, Dec 20, 2023)
ecb23d4 restore progress_callback behavior (slaren, Dec 20, 2023)
8ed2a8e move final progress_callback call to load_all_data (slaren, Dec 20, 2023)
a4e191f cuda : fix fprintf format string (minor) (ggerganov, Dec 21, 2023)
a74b1a8 do not offload scales (slaren, Dec 21, 2023)
6a72c7f Merge remote-tracking branch 'origin/master' into sl/ggml-backend-int (slaren, Dec 21, 2023)
cd4167b llama_mmap : avoid unmapping the same fragments again in the destructor (slaren, Dec 21, 2023)
16582cd Merge remote-tracking branch 'origin/master' into sl/ggml-backend-int (slaren, Dec 21, 2023)
323881e remove unnecessary unmap (slaren, Dec 21, 2023)
f4d884f metal : add default log function that prints to stderr, cleanup code (slaren, Dec 21, 2023)
2 changes: 1 addition & 1 deletion Makefile
@@ -65,7 +65,7 @@ test: $(TEST_TARGETS)
./$$test_target; \
fi; \
if [ $$? -ne 0 ]; then \
printf 'Test $$test_target FAILED!\n\n' $$test_target; \
printf 'Test %s FAILED!\n\n' $$test_target; \
failures=$$(( failures + 1 )); \
else \
printf 'Test %s passed.\n\n' $$test_target; \
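The Makefile fix above replaces a format string that contained the literal text `$$test_target` (which the single quotes keep unexpanded) with a `%s` conversion, so the failing test's name is actually substituted. Shell `printf` follows C `printf` semantics here; below is a minimal C sketch of the corrected pattern, with a hypothetical test name:

```c
#include <stdio.h>

int main(void) {
    const char * test_target = "tests/test-grad0";  // hypothetical test name

    // Broken pattern: the name is baked into the format string as literal text,
    // so the extra argument is ignored and nothing is substituted.
    // printf("Test $test_target FAILED!\n\n");

    // Fixed pattern, matching the Makefile change: constant format string,
    // test name passed as the argument consumed by %s.
    printf("Test %s FAILED!\n\n", test_target);
    return 0;
}
```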
11 changes: 7 additions & 4 deletions ggml-alloc.c
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
if (update_backend) {
view->backend = view->view_src->backend;
}
view->buffer = view->view_src->buffer;
// views are initialized in the alloc buffer rather than the view_src buffer
view->buffer = alloc->buffer;
view->data = (char *)view->view_src->data + view->view_offs;

// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
}

void ggml_allocr_free(ggml_allocr_t alloc) {
if (alloc == NULL) {
return;
}

ggml_gallocr_free(alloc->galloc);
ggml_tallocr_free(alloc->talloc);
free(alloc);
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
}

if (nbytes == 0) {
fprintf(stderr, "%s: no tensors to allocate\n", __func__);
//fprintf(stderr, "%s: no tensors to allocate\n", __func__);
return NULL;
}

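The new NULL check makes `ggml_allocr_free` behave like `free`: calling it on an allocator that was never created is a no-op. A minimal sketch of the calling pattern this enables, assuming the ggml-alloc API shown above; the `loader_state` struct is hypothetical:

```c
#include "ggml-alloc.h"

// hypothetical owner of per-backend allocators
struct loader_state {
    ggml_allocr_t alloc_cpu;  // may stay NULL if all tensors are offloaded
    ggml_allocr_t alloc_gpu;  // may stay NULL when the GPU backend is disabled (e.g. -ngl 0)
};

static void loader_state_release(struct loader_state * s) {
    // no per-field NULL checks needed: ggml_allocr_free(NULL) is now a no-op
    ggml_allocr_free(s->alloc_cpu);
    ggml_allocr_free(s->alloc_gpu);
}
```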
49 changes: 45 additions & 4 deletions ggml-backend.c
@@ -378,7 +378,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -456,7 +455,7 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
}

ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
/* .iface = */ {
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
@@ -466,8 +465,50 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
/* .context = */ NULL,
};

return &ggml_backend_buffer_type_cpu;
return &ggml_backend_cpu_buffer_type;
}

#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>

// HBM buffer type
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context);
}

static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
//void * ptr = hbw_malloc(size);
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) {
fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
return NULL;
}

// FIXME: this is a hack to avoid having to implement a new buffer type
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;

return buffer;
}

struct ggml_backend_buffer_type_i cpu_backend_hbm_buffer_type_interface = {
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
};

ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
/* .iface = */ cpu_backend_hbm_buffer_type_interface,
/* .context = */ NULL,
};

return &ggml_backend_cpu_buffer_type_hbm;
}
#endif

struct ggml_backend_cpu_context {
int n_threads;
@@ -505,7 +546,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
cpu_plan->cgraph = *cgraph;
cpu_plan->cgraph = *cgraph; // FIXME: deep copy

if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
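The new `GGML_USE_CPU_HBM` path reuses the plain CPU buffer implementation but backs it with `hbw_posix_memalign`, so tensors can be placed in high-bandwidth memory (memkind/hbwmalloc) without a dedicated buffer implementation. A hedged sketch of how a caller might select it, using `ggml_backend_alloc_ctx_tensors_from_buft` from the ggml-alloc.c change above; the `alloc_weights` helper is illustrative:

```c
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// allocate every tensor created in ctx into a single buffer, preferring
// high-bandwidth memory when ggml is built with GGML_USE_CPU_HBM
static ggml_backend_buffer_t alloc_weights(struct ggml_context * ctx) {
#ifdef GGML_USE_CPU_HBM
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_hbm_buffer_type();
#else
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
#endif
    // returns NULL when there is nothing to allocate (see the ggml-alloc.c change above)
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
}
```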
4 changes: 4 additions & 0 deletions ggml-backend.h
@@ -76,6 +76,10 @@ extern "C" {

GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif

//
// Backend registry
//
63 changes: 29 additions & 34 deletions ggml-cuda.cu
@@ -9005,7 +9005,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {

char * buf;
CUDA_CHECK(cudaMalloc(&buf, size));
char * buf_host = (char*)data + offset_split;
char * buf_host = (char *)data + offset_split;

// set padding to 0 to avoid possible NaN values
if (size > original_size) {
@@ -9150,11 +9150,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)

ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();

const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
tensor->op == GGML_OP_VIEW;
const bool inplace = tensor->view_src != nullptr;

if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t view_offset = 0;
if (tensor->op == GGML_OP_VIEW) {
@@ -9474,23 +9473,25 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
}

static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;

UNUSED(buffer);
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize());

CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
}

static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;

UNUSED(buffer);
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize());

CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
}

static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
@@ -9552,35 +9553,35 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
UNUSED(buft);
}

static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
};

ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
static bool ggml_backend_buffer_type_cuda_initialized = false;
if (!ggml_backend_buffer_type_cuda_initialized) {
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];

static bool ggml_backend_cuda_buffer_type_initialized = false;

if (!ggml_backend_cuda_buffer_type_initialized) {
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
ggml_backend_buffer_type_cuda[i] = {
/* .iface = */ cuda_backend_buffer_type_interface,
ggml_backend_cuda_buffer_types[i] = {
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
/* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
};
}
ggml_backend_buffer_type_cuda_initialized = true;
ggml_backend_cuda_buffer_type_initialized = true;
}

return &ggml_backend_buffer_type_cuda[device];
return &ggml_backend_cuda_buffer_types[device];
}

// host buffer type

static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
delete ctx;
CUDA_CHECK(cudaFreeHost(buffer->context));
}

static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -9593,24 +9594,22 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;

return buffer;

UNUSED(buft);
}

struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
struct ggml_backend_buffer_type_i ggml_backend_cuda_host_buffer_type_interface = {
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
};

ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
/* .iface = */ cuda_backend_host_buffer_type_interface,
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
/* .iface = */ ggml_backend_cuda_host_buffer_type_interface,
/* .context = */ nullptr,
};

return &ggml_backend_buffer_type_cuda_host;
return &ggml_backend_cuda_buffer_type_host;
}

// backend
@@ -9642,8 +9641,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;

GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
Expand All @@ -9653,8 +9650,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;

GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
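The synchronous `set_tensor`/`get_tensor` callbacks now switch to the buffer's device and synchronize before the blocking `cudaMemcpy`, and the new host buffer type hands out pinned (`cudaMallocHost`) memory that the CPU backend can also use. A hedged sketch of using it as a staging area for uploads; `ggml_backend_buft_alloc_buffer`, `ggml_backend_buffer_get_base`, `ggml_backend_tensor_set` and `ggml_backend_buffer_free` are assumed to behave as declared in ggml-backend.h, and the helper itself is hypothetical:

```c
#include <string.h>
#include "ggml-backend.h"
#include "ggml-cuda.h"

// copy `size` bytes into a GPU tensor, staging them through pinned host memory
static void upload_through_pinned(struct ggml_tensor * dst_gpu, const void * src, size_t size) {
    ggml_backend_buffer_t staging = ggml_backend_buft_alloc_buffer(ggml_backend_cuda_host_buffer_type(), size);
    void * pinned = ggml_backend_buffer_get_base(staging);

    memcpy(pinned, src, size);
    // lands in the cudaMemcpyHostToDevice path shown above; pinned memory lets the
    // driver DMA directly instead of bouncing through a pageable staging copy
    ggml_backend_tensor_set(dst_gpu, pinned, 0, size);

    ggml_backend_buffer_free(staging);
}
```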
3 changes: 3 additions & 0 deletions ggml-metal.h
@@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);

GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);

GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);

GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

// helper to check if the device supports a specific family
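`ggml_backend_metal_buffer_from_ptr` exposes a way to wrap an existing host allocation, such as an mmapped model file, as a Metal buffer instead of copying the weights. A sketch of the intended call, assuming (not confirmed by this diff) that `max_size` bounds how large each underlying Metal buffer may be; the wrapper function and variable names are illustrative:

```c
#include <stddef.h>
#include "ggml-backend.h"
#include "ggml-metal.h"

// wrap a host memory region (e.g. an mmapped model file) as a Metal buffer
static ggml_backend_buffer_t wrap_mapped_weights(void * mapped_addr, size_t used_size, size_t max_size) {
    ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr(mapped_addr, used_size, max_size);
    if (buf == NULL) {
        // e.g. the pointer or size does not satisfy Metal's alignment requirements
        return NULL;
    }
    return buf;
}
```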