@@ -7057,6 +7057,7 @@ inline void ggml_cuda_op_upscale(
 
     (void) src1;
     (void) dst;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_pad(
@@ -7073,6 +7074,7 @@ inline void ggml_cuda_op_pad(
 
     (void) src1;
     (void) dst;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_rms_norm(
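The two hunks above discard the src1_dd argument explicitly: the ggml_cuda_op_* functions share a common signature, so an op that never touches one of its arguments casts it to void to keep -Wall -Wextra builds free of unused-parameter warnings. A minimal sketch of the idiom (the function and names below are illustrative, not from this file):

    // Shared-signature op that only reads src0: arguments the op does not
    // need are silenced with a cast to void instead of being removed.
    static void example_op(const float * src0, const float * src1, float * dst, int n) {
        for (int i = 0; i < n; i++) {
            dst[i] = 2.0f * src0[i];
        }
        (void) src1; // required by the shared signature, unused here
    }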
@@ -8958,7 +8960,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
         char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        char * buf_host = (char*)data + offset_split;
+        char * buf_host = (char *)data + offset_split;
 
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
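The context above sits in ggml_cuda_transform_tensor, where a split tensor's device allocation may be padded beyond its original size, and the padding is zeroed so later reads never pick up uninitialized bytes that decode to NaN. A sketch of that allocation path, consistent with the surrounding lines but not copied verbatim from the file:

    // Allocate the (possibly padded) device buffer, zero the padding,
    // then upload the real payload from the host slice.
    char * buf;
    CUDA_CHECK(cudaMalloc(&buf, size));
    char * buf_host = (char *)data + offset_split;
    if (size > original_size) {
        CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
    }
    CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));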
@@ -9103,11 +9105,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
 
     ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
 
-    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+    const bool inplace = tensor->view_src != nullptr;
 
-    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) tensor->src[0]->extra;
+    if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
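This hunk replaces the old inplace heuristic (src[0] shares a data pointer, or the op is GGML_OP_VIEW) with ggml's view_src field, which records directly which tensor a view aliases. The body that follows presumably resolves the view's device address from the viewed tensor; a sketch under that assumption:

    // With view_src available, the device pointer of a view is the viewed
    // tensor's device pointer plus the view's byte offset (stored in
    // op_params for GGML_OP_VIEW).
    size_t view_offset = 0;
    if (tensor->op == GGML_OP_VIEW) {
        memcpy(&view_offset, tensor->op_params, sizeof(size_t));
    }
    extra->data_device[g_main_device] = src0_ddc + view_offset;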
@@ -9431,19 +9432,25 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
-    UNUSED(buffer);
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
 }
 
 static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
-    UNUSED(buffer);
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
 }
 
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
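Both copy paths now pull the owning device out of the buffer context, make it current, and drain pending work before the blocking cudaMemcpy, presumably so a copy can never overlap kernels still in flight on a non-default stream of that device. Stripped of the ggml plumbing, the ordering looks like this (device, dev_ptr, host_ptr, offset and size are placeholders):

    // Select the device that owns the allocation, wait for in-flight
    // kernels, then perform a synchronous host-to-device copy.
    CUDA_CHECK(cudaSetDevice(device));
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy((char *) dev_ptr + offset, host_ptr, size, cudaMemcpyHostToDevice));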
@@ -9505,35 +9512,35 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
     UNUSED(buft);
 }
 
-static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
 };
 
 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
-    static bool ggml_backend_buffer_type_cuda_initialized = false;
-    if (!ggml_backend_buffer_type_cuda_initialized) {
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+    static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+    if (!ggml_backend_cuda_buffer_type_initialized) {
         for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
-            ggml_backend_buffer_type_cuda[i] = {
-                /* .iface   = */ cuda_backend_buffer_type_interface,
+            ggml_backend_cuda_buffer_types[i] = {
+                /* .iface   = */ ggml_backend_cuda_buffer_type_interface,
                 /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
             };
         }
-        ggml_backend_buffer_type_cuda_initialized = true;
+        ggml_backend_cuda_buffer_type_initialized = true;
    }
 
-    return &ggml_backend_buffer_type_cuda[device];
+    return &ggml_backend_cuda_buffer_types[device];
 }
 
 // host buffer type
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
-    delete ctx;
+    CUDA_CHECK(cudaFreeHost(buffer->context));
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
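ggml_backend_cuda_buffer_type now keeps one buffer type per device in a function-scope static array, filled once on first use, with the device id packed into the context pointer through an intptr_t cast. The same lazy-init pattern in isolation (names and the fixed size are illustrative):

    // One descriptor per device, built on first call and returned by
    // address afterwards. Note: not thread-safe on the very first call.
    struct descriptor { int device; };

    static descriptor * get_descriptor(int device) {
        static descriptor descriptors[16];
        static bool initialized = false;
        if (!initialized) {
            for (int i = 0; i < 16; i++) {
                descriptors[i].device = i;
            }
            initialized = true;
        }
        return &descriptors[device];
    }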
@@ -9546,24 +9553,22 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
     return buffer;
-
-    UNUSED(buft);
 }
 
-struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
+struct ggml_backend_buffer_type_i ggml_backend_cuda_host_buffer_type_interface = {
     /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
     /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
     /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
 };
 
 ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
-        /* .iface   = */ cuda_backend_host_buffer_type_interface,
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+        /* .iface   = */ ggml_backend_cuda_host_buffer_type_interface,
         /* .context = */ nullptr,
     };
 
-    return &ggml_backend_buffer_type_cuda_host;
+    return &ggml_backend_cuda_buffer_type_host;
 }
 
 // backend
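For the host buffer type, the pinned allocation is now stored directly as buffer->context, so teardown is a single cudaFreeHost with no heap-allocated context object to delete, and the unreachable UNUSED(buft) after the return is dropped. The pinned-memory round trip in miniature (size is a placeholder):

    // cudaMallocHost returns page-locked host memory the GPU can DMA
    // against; the raw pointer alone is enough state to release it.
    void * ptr = nullptr;
    CUDA_CHECK(cudaMallocHost(&ptr, size));
    // ... use ptr as the backend buffer's context ...
    CUDA_CHECK(cudaFreeHost(ptr));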