Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 79 additions & 2 deletions ggml/src/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -854,6 +854,85 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor([[maybe_unused]
}

GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
if (!tensor->extra && tensor->view_src && tensor->view_src->extra) {
// OK, this is an ugly hack, but I don't really see a way to trick the machine into correctly
// loading non-contiguous merged split tensors.
auto view_src = tensor->view_src;
auto extra = (ggml_split_tensor_t *)view_src->extra;
void * extra_ptr;
memcpy(&extra_ptr, view_src->op_params, sizeof(extra_ptr));
if (extra_ptr) {
std::string merged_name = view_src->name;
if (auto pos = merged_name.find("ffn_gate_up_exps.weight"); pos != std::string::npos) {
std::string name = tensor->name;
auto pos_u = name.find("ffn_up_exps.weight");
auto pos_g = name.find("ffn_gate_exps.weight");
if (pos_u != std::string::npos || pos_g != std::string::npos) {
GGML_ASSERT(extra->split_dim == 1);
auto & ranges = *(const std::vector<std::vector<std::pair<int,int>>> *)extra_ptr;
int ne = 0;
for (int is = 0; is < int(ranges.size()); ++is) {
auto & r = ranges[is];
GGML_ASSERT((extra->splits[is] && !r.empty()) || (!extra->splits[is] && r.empty()));
if (r.empty()) continue;
GGML_ASSERT(r.size() == 2);
auto split = extra->splits[is];
ggml_cuda_set_device(is);
int ir = pos_g != std::string::npos ? 0 : 1;
auto p = r[ir];
size_t offset = 0;
if (ir == 1) {
p.first -= tensor->ne[1];
GGML_ASSERT(p.first >= 0);
offset = split->ne[1]/2 * split->nb[1];
}
for (int i02 = 0; i02 < split->ne[2]; ++i02) {
auto dst = (char *)split->data + i02*split->nb[2] + offset;
auto src = (const char *)data + i02*tensor->nb[2] + ne*tensor->nb[1];
CUDA_CHECK(cudaMemcpyAsync(dst, src, p.second*tensor->nb[1], cudaMemcpyHostToDevice, cudaStreamPerThread));
}
ne += p.second;
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
}
return;
}
if (auto pos = merged_name.find("ffn_gate_up_exps.bias"); pos != std::string::npos) {
std::string name = tensor->name;
auto pos_u = name.find("ffn_up_exps.bias");
auto pos_g = name.find("ffn_gate_exps.bias");
if (pos_u != std::string::npos || pos_g != std::string::npos) {
GGML_ASSERT(extra->split_dim == 0);
auto & ranges = *(const std::vector<std::vector<std::pair<int,int>>> *)extra_ptr;
int ne = 0;
for (int is = 0; is < int(ranges.size()); ++is) {
auto & r = ranges[is];
GGML_ASSERT((extra->splits[is] && !r.empty()) || (!extra->splits[is] && r.empty()));
if (r.empty()) continue;
GGML_ASSERT(r.size() == 2);
auto split = extra->splits[is];
ggml_cuda_set_device(is);
int ir = pos_g != std::string::npos ? 0 : 1;
auto p = r[ir];
size_t offset = 0;
if (ir == 1) {
p.first -= tensor->ne[0];
GGML_ASSERT(p.first >= 0);
offset = split->ne[0]/2 * split->nb[0];
}
for (int i01 = 0; i01 < split->ne[1]; ++i01) {
auto dst = (char *)split->data + i01*split->nb[1] + offset;
auto src = (const char *)data + i01*tensor->nb[1] + ne*tensor->nb[0];
CUDA_CHECK(cudaMemcpyAsync(dst, src, p.second*tensor->nb[0], cudaMemcpyHostToDevice, cudaStreamPerThread));
}
ne += p.second;
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
}
return;
}
}
}
if (!tensor->extra) return;
static std::map<ggml_type, int> k_map = {
{ GGML_TYPE_Q4_0_R8 , 8},
Expand Down Expand Up @@ -886,7 +965,6 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
{ GGML_TYPE_Q8_KV_R8 , 4},
{ GGML_TYPE_Q8_K_R8 , 8},
};
//printf("%s(%s)\n", __func__, tensor->name);

// split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0);
Expand Down Expand Up @@ -984,7 +1062,6 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
auto row_size = ggml_row_size(tensor->type, tensor->ne[0]);
std::vector<char> host_buffer;
int ne1 = 0;
int extra_ne1 = 0;
for (int i = 0; i < extra->n_device; ++i) {
auto split = extra->splits[i];
if (!split) continue;
Expand Down
12 changes: 4 additions & 8 deletions src/llama-load-tensors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3168,6 +3168,9 @@ bool create_tensors_helper::merge_up_gate_exps(const LLM_TN & tn, int i, int bia

layer.ffn_up_gate_exps = ggml_new_tensor_3d(u_ctx, u_meta->type, u_meta->ne[0], u_meta->ne[1] + g_meta->ne[1], u_meta->ne[2]);
snprintf(layer.ffn_up_gate_exps->name, GGML_MAX_NAME, "blk.%d.ffn_gate_up_exps.weight", i);
if (u_ctx == ctx_split) {
split_tensors.insert(layer.ffn_up_gate_exps);
}
layer.ffn_gate_exps = ml.create_tensor_as_view(u_ctx, layer.ffn_up_gate_exps, g_name.c_str(),
{ g_meta->ne[0], g_meta->ne[1], g_meta->ne[2] }, 0);
layer.ffn_up_exps = ml.create_tensor_as_view(u_ctx, layer.ffn_up_gate_exps, u_name.c_str(),
Expand Down Expand Up @@ -3702,13 +3705,6 @@ bool create_tensors_helper::create_tensors() {
LLAMA_LOG_WARN("========================================================\n\n");
ml.merge_qkv = false;
}
if (ml.merge_up_gate_exps && (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)) {
LLAMA_LOG_WARN("\n========================================================\n");
LLAMA_LOG_WARN("merge_up_gate_exps is not compatible with split mode 'graph'\n");
LLAMA_LOG_WARN(" => turning off merge_up_gate_exps\n");
LLAMA_LOG_WARN("========================================================\n\n");
ml.merge_up_gate_exps = false;
}
switch (model.arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_REFACT:
Expand Down Expand Up @@ -4039,7 +4035,7 @@ bool create_tensors_helper::create_tensors() {
prepare_split_tensors(1, ctx_split, layer.ffn_up_gate_exps, layer.split_ffn_up_gate_exps, up_gate_split, mem_used);
prepare_up_gate_split(layer.ffn_up_gate_exps, layer.split_ffn_up_gate_exps);
if (layer.ffn_up_gate_exps_b) {
prepare_split_tensors(1, ctx_split, layer.ffn_up_gate_exps_b, layer.split_ffn_up_gate_exps_b, up_gate_split, mem_used);
prepare_split_tensors(0, ctx_split, layer.ffn_up_gate_exps_b, layer.split_ffn_up_gate_exps_b, up_gate_split, mem_used);
prepare_up_gate_split(layer.ffn_up_gate_exps_b, layer.split_ffn_up_gate_exps_b);
}
} else {
Expand Down
4 changes: 3 additions & 1 deletion src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5203,7 +5203,9 @@ struct llama_context * llama_init_from_model(
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
}

llama_repack_up_gate_exps(*ctx);
if (ctx->model.split_mode != LLAMA_SPLIT_MODE_GRAPH) {
llama_repack_up_gate_exps(*ctx);
}

// build worst-case graph
int n_past = cparams.n_ctx - n_tokens;
Expand Down