Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
109 commits
Select commit Hold shift + click to select a range
93864cd
llama : experimental DeepSeek2 MLA implementation that caches latent …
sszymczy Jan 22, 2025
f0ce53f
Merge remote-tracking branch 'origin/master' into deepseek2-mla-exp
sszymczy Jan 24, 2025
f07c2ec
llama : add option to override tensor buffers
slaren Jan 24, 2025
de538aa
llama : optimize DeepSeek MLA implementation
sszymczy Jan 25, 2025
ce73063
llama : Update tensor names in DeepSeek2 MLA implementation.
sszymczy Jan 26, 2025
202f323
llama : add a second copy of c^KV cache in DeepSeek2 MLA to avoid tra…
sszymczy Jan 26, 2025
93c5937
llama : modified tensor permutations to multiply larger matrices duri…
sszymczy Jan 26, 2025
1eee98f
llama : removed unnecessary code in DeepSeek V2 implementation
sszymczy Jan 27, 2025
8ff0991
convert : make lint happy
sszymczy Jan 28, 2025
8a887de
llama : prompt processing optimizations in DeepSeek V2
sszymczy Jan 28, 2025
7654331
llama : avoid ggml_cont() if possible in DeepSeek V2 implementation
sszymczy Jan 30, 2025
83a473a
llama : use all experts during warmup
sszymczy Feb 1, 2025
c8bc6e4
llama : increased max_nodes as large MoE models use massive amounts o…
sszymczy Feb 1, 2025
bb6b97e
Merge remote-tracking branch 'origin/master' into sl/custom-tensor-of…
slaren Feb 2, 2025
6c8d01a
add regex support
slaren Feb 2, 2025
538f609
ggml : fix possible underflow in ggml_nbytes
slaren Feb 6, 2025
8770ffa
rebuild buft list on every call
slaren Feb 8, 2025
2e54433
Merge remote-tracking branch 'origin/master' into sl/custom-tensor-of…
slaren Feb 8, 2025
c44de8a
Merge remote-tracking branch 'origin' into sl/custom-tensor-offload
orca-zhang Feb 25, 2025
0d4ff95
can shift
orca-zhang Feb 25, 2025
e162e47
Merge remote-tracking branch 'fairydreaming/deepseek2-mla-exp' into tmp
orca-zhang Feb 25, 2025
7e4cae5
Merge remote-tracking branch 'fairydreaming/experts-warmup' into tmp
orca-zhang Feb 25, 2025
d256aa0
tmp
orca-zhang Feb 25, 2025
8c88371
Merge branch 'ggml-org:master' into tmp
orca-zhang Feb 25, 2025
42c0aa2
Merge branch 'ggml-org:master' into tmp
orca-zhang Feb 25, 2025
d13d6ff
support dynamic wkv
orca-zhang Feb 26, 2025
822807b
ignore missing
orca-zhang Feb 26, 2025
9f75d93
fix core dump
orca-zhang Feb 26, 2025
dafd46a
add debug log
orca-zhang Feb 26, 2025
6277a0e
debug
orca-zhang Feb 26, 2025
88536f7
debug log
orca-zhang Feb 26, 2025
fe68015
add log
orca-zhang Feb 26, 2025
93674de
debug
orca-zhang Feb 26, 2025
770184d
tmp
orca-zhang Feb 26, 2025
83fb5b8
tmp
orca-zhang Feb 26, 2025
7dba6fb
remove log
orca-zhang Feb 26, 2025
fd32a43
add lock-free hash_map
orca-zhang Feb 26, 2025
b092a2c
tmp
orca-zhang Feb 26, 2025
2ffbc62
fix
orca-zhang Feb 26, 2025
4c24c26
fix
orca-zhang Feb 26, 2025
21a6c92
fix
orca-zhang Feb 26, 2025
45e3f2e
fix
orca-zhang Feb 26, 2025
8b3be10
fix
orca-zhang Feb 26, 2025
eb8e058
fix
orca-zhang Feb 26, 2025
a0317cd
fix
orca-zhang Feb 26, 2025
a15e010
fix
orca-zhang Feb 26, 2025
70fb2f9
fix
orca-zhang Feb 26, 2025
ab9a13a
fix
orca-zhang Feb 26, 2025
751ff03
fix
orca-zhang Feb 26, 2025
ed3e35b
Merge branch 'ggml-org:master' into tmp
orca-zhang Feb 26, 2025
46ac9f6
revert MLA
orca-zhang Feb 26, 2025
69355a0
revert MLA
orca-zhang Feb 26, 2025
f9c292e
revert
orca-zhang Feb 26, 2025
6cccad2
fix
orca-zhang Feb 26, 2025
20e429d
add flash_attn
orca-zhang Feb 26, 2025
17cf6f8
fix warning
orca-zhang Feb 26, 2025
6627422
fix
orca-zhang Feb 26, 2025
d826821
fix
orca-zhang Feb 26, 2025
a5ca0eb
fix
orca-zhang Feb 26, 2025
4c33abe
add log
orca-zhang Feb 26, 2025
794d740
fix
orca-zhang Feb 26, 2025
d70d9f0
fix
orca-zhang Feb 26, 2025
fad3960
add log
orca-zhang Feb 26, 2025
6372f54
fix log
orca-zhang Feb 26, 2025
1eeec1c
fix prec
orca-zhang Feb 26, 2025
be5f499
add
orca-zhang Feb 26, 2025
e341ec6
fix
orca-zhang Feb 26, 2025
6305eb7
fix
orca-zhang Feb 26, 2025
ebb19c5
fix
orca-zhang Feb 26, 2025
907d09a
fix
orca-zhang Feb 26, 2025
3395a34
fix
orca-zhang Feb 26, 2025
5f5f9cd
fix
orca-zhang Feb 26, 2025
dcbce53
fix
orca-zhang Feb 27, 2025
1647e2b
fix
orca-zhang Feb 27, 2025
ef0b5c4
fix
orca-zhang Feb 27, 2025
00fd137
fix
orca-zhang Feb 27, 2025
64721f6
fix
orca-zhang Feb 27, 2025
1c49614
fix
orca-zhang Feb 27, 2025
e07b5d9
fix
orca-zhang Feb 27, 2025
f1fbc19
fix
orca-zhang Feb 27, 2025
bb532d8
fix
orca-zhang Feb 27, 2025
42908f9
fix
orca-zhang Feb 27, 2025
616218a
fix
orca-zhang Feb 27, 2025
6993e96
fix
orca-zhang Feb 27, 2025
848cade
fix
orca-zhang Feb 27, 2025
892bbc6
fix
orca-zhang Feb 27, 2025
c0827df
fix
orca-zhang Feb 27, 2025
ac4409f
fix
orca-zhang Feb 27, 2025
b1a1562
fix
orca-zhang Feb 27, 2025
b0778a6
fix
orca-zhang Feb 27, 2025
eec8dad
fix
orca-zhang Feb 27, 2025
b218b9b
fix
orca-zhang Feb 27, 2025
28471eb
fix
orca-zhang Feb 27, 2025
f298a83
fix
orca-zhang Feb 27, 2025
5001dee
fix
orca-zhang Feb 27, 2025
3dba017
fix
orca-zhang Feb 27, 2025
3f96065
fix
orca-zhang Feb 27, 2025
2dd8afc
fix
orca-zhang Feb 27, 2025
6dcfa89
fix
orca-zhang Feb 27, 2025
50c7940
fix
orca-zhang Feb 27, 2025
87f1435
fix
orca-zhang Feb 27, 2025
f8f5be1
fix
orca-zhang Feb 27, 2025
97677f7
fix
orca-zhang Feb 27, 2025
f24aed8
fix
orca-zhang Feb 28, 2025
a29ac57
fix
orca-zhang Feb 28, 2025
d19e2da
fix
orca-zhang Feb 28, 2025
a951520
tmp
orca-zhang Feb 28, 2025
f33d3ac
fix
orca-zhang Feb 28, 2025
78b0d1d
tmp
orca-zhang Mar 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "arg.h"

#include "common.h"
#include "log.h"
#include "sampling.h"
#include "chat.h"
Expand Down Expand Up @@ -322,6 +323,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
params.kv_overrides.back().key[0] = 0;
}

if (!params.tensor_buft_overrides.empty()) {
params.tensor_buft_overrides.push_back({nullptr, nullptr});
}

if (params.reranking && params.embedding) {
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
}
Expand Down Expand Up @@ -1615,6 +1620,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
exit(0);
}
));
// Register the "--override-tensor" / "-ot" CLI option: lets the user pin
// tensors (matched by name pattern) to a specific backend buffer type,
// as a comma-separated list of "<pattern>=<buffer type>" pairs.
add_opt(common_arg(
{"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
"override tensor buffer type", [](common_params & params, const std::string & value) {
// NOTE(review): `static` is commented out, so this map is reconstructed on
// every invocation and the empty() guard below is always true. Presumably
// intentional so the buffer-type list reflects devices registered after the
// first call — confirm before restoring `static`.
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
if (buft_list.empty()) {
// enumerate all the devices and add their buffer types to the list
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
auto * dev = ggml_backend_dev_get(i);
auto * buft = ggml_backend_dev_buffer_type(dev);
if (buft) {
// keyed by the buffer type's printable name so it can be matched
// against the user-supplied "<buffer type>" string
buft_list[ggml_backend_buft_name(buft)] = buft;
}
}
}

// parse each "<pattern>=<buffer type>" entry from the comma-separated value
for (const auto & override : string_split<std::string>(value, ',')) {
std::string::size_type pos = override.find('=');
if (pos == std::string::npos) {
throw std::invalid_argument("invalid value");
}
std::string tensor_name = override.substr(0, pos);
std::string buffer_type = override.substr(pos + 1);

if (buft_list.find(buffer_type) == buft_list.end()) {
// unknown buffer type: print the valid choices before rejecting
// NOTE(review): diagnostic goes to stdout via printf — consider stderr.
printf("Available buffer types:\n");
for (const auto & it : buft_list) {
printf("  %s\n", ggml_backend_buft_name(it.second));
}
throw std::invalid_argument("unknown buffer type");
}
// FIXME: this leaks memory
// (the strdup'd pattern string is never freed; the vector stores a raw
// char* whose ownership has no release point)
params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
}
}
));
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
"number of layers to store in VRAM",
Expand Down
10 changes: 10 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1082,22 +1082,32 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
if (!params.devices.empty()) {
mparams.devices = params.devices.data();
}

if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}

mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;

if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
} else {
GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
mparams.kv_overrides = params.kv_overrides.data();
}

if (params.tensor_buft_overrides.empty()) {
mparams.tensor_buft_overrides = NULL;
} else {
GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
}

return mparams;
}

Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ struct common_params {
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
Expand Down
2 changes: 1 addition & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5109,4 +5109,4 @@ def main() -> None:


if __name__ == '__main__':
main()
main()
Loading
Loading