Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1533,7 +1533,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "-fdn" || arg == "--fused-delta-net") {
CHECK_ARG
params.fused_delta_net = std::stoi(argv[i]);
fprintf(stderr, "=================== %s has been deprecated\n", arg.c_str());
return true;
}
if (arg == "-smf16" || arg == "--split-mode-f16") {
Expand Down Expand Up @@ -2276,7 +2276,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-grt, --graph-reduce-type", "Type for data exchange between GPUs (default: %s)", "f32"});
options.push_back({ "*", "-smgs, --split-mode-graph-scheduling,", "Force Split Mode Graph Scheduling (default: %d)", params.split_mode_graph_scheduling});
options.push_back({ "*", "-sas, --scheduler_async,", "Async evaluation of compute graphs: %d)", params.scheduler_async});
options.push_back({ "*", "-fdn, --fused-delta-net N", "Use fused delta-net when batch size is <= N with recurrent models: %d)", params.fused_delta_net});
options.push_back({ "*", "-vq, --validate-quants", "validate quantized data while loading the model (default: %d)", params.validate_quants});
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
"in conversation mode, this will be used as system prompt\n"
Expand Down Expand Up @@ -3355,7 +3354,6 @@ struct llama_context_params common_context_params_to_llama(const gpt_params & pa
cparams.split_mode_graph_scheduling = params.split_mode_graph_scheduling;
//cparams.split_mode_f16 = params.split_mode_f16;
cparams.scheduler_async = params.scheduler_async;
cparams.fused_delta_net = params.fused_delta_net;
cparams.min_experts = params.min_experts;
cparams.thresh_experts = params.thresh_experts;
cparams.only_active_experts = params.only_active_exps;
Expand Down Expand Up @@ -4366,7 +4364,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
//fprintf(stream, "split_mode_f16: %s # default: true\n", params.split_mode_f16 ? "true" : "false");
fprintf(stream, "reduce_type: %s # default f16\n", params.reduce_type.c_str());
fprintf(stream, "scheduler_async: %s # default: false\n", params.scheduler_async ? "true" : "false");
fprintf(stream, "fused_delta_net: %d # default: 0\n", params.fused_delta_net );
fprintf(stream, "ser: %d,%g # defaulr: -1,0\n", params.min_experts, params.thresh_experts);
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);

Expand Down
1 change: 0 additions & 1 deletion common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,6 @@ struct gpt_params {
bool split_mode_graph_scheduling = false; // if true, force split mode graph scheduling
//bool split_mode_f16 = true; // if true, intermediate results will be cast to f16 before copying to other GPUs to perform reduce ops
bool scheduler_async = false; // if true, in split mode graph the scheduler will use multiple threads to evaluate the graph
int fused_delta_net = 65536; // use fused delta-net if number of tokens in the batch is less than this value
bool has_mtp = false; // enable MTP if supported by the model

std::string cache_type_k = "f16"; // KV cache data type for the K
Expand Down
33 changes: 3 additions & 30 deletions examples/llama-bench/llama-bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,6 @@ struct cmd_params {
bool muge = false;
bool rcache = false;
bool sas = false;
int fdn = 65536; // fdn = fused delta net
bool print_overrides = false;
output_formats output_format;
output_formats output_format_stderr;
Expand Down Expand Up @@ -317,7 +316,6 @@ static const cmd_params cmd_params_defaults = {
/* muge */ false,
/* rcache */ false,
/* sas */ false,
/* fdn */ 65536,
/* print_overrides */ false,
/* output_format */ MARKDOWN,
/* output_format_stderr */ NONE,
Expand Down Expand Up @@ -371,7 +369,6 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -no-fug, --no-fused-up-gate <0|1> (default: %s)\n", cmd_params_defaults.no_fug? "1" : "0");
printf(" -no-ooae, --no-offload-only-active-experts <0|1> (default: %s)\n", cmd_params_defaults.no_ooae? "1" : "0");
printf(" -sas, --scheduler-async <0|1> (default: %s)\n", cmd_params_defaults.sas ? "1" : "0");
printf(" -fdn, --fused-delta-net <n> (default: %d)\n", cmd_params_defaults.fdn);
printf(" --print-overrides <0|1> (default: %s)\n", cmd_params_defaults.print_overrides ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
Expand Down Expand Up @@ -813,12 +810,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
params.sas = std::stoi(argv[i]);
} else if (arg == "-fdn" || arg == "--fused-delta-net") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.fdn = std::stoi(argv[i]);
} else if (arg == "-rcache" || arg == "--rope-cache") {
if (++i >= argc) {
invalid_param = true;
Expand Down Expand Up @@ -965,7 +956,6 @@ struct cmd_params_instance {
bool muge = false;
bool rcache = false;
bool sas = false;
int fdn = 0;
const llama_model_tensor_buft_override* buft_overrides;

llama_model_params to_llama_mparams() const {
Expand Down Expand Up @@ -1001,7 +991,6 @@ struct cmd_params_instance {
muge == other.muge &&
use_thp == other.use_thp &&
sas == other.sas &&
fdn == other.fdn &&
tensor_split == other.tensor_split;
}

Expand All @@ -1028,7 +1017,6 @@ struct cmd_params_instance {
cparams.embeddings = embeddings;
cparams.cuda_params = (void *)cuda_params.data();
cparams.scheduler_async = sas;
cparams.fused_delta_net = fdn;

return cparams;
}
Expand Down Expand Up @@ -1095,7 +1083,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .muge = */ params.muge,
/* .rcache = */ params.rcache,
/* .sas = */ params.sas,
/* .fdn = */ params.fdn,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
Expand Down Expand Up @@ -1139,7 +1126,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .muge = */ params.muge,
/* .rcache = */ params.rcache,
/* .sas = */ params.sas,
/* .fdn = */ params.fdn,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
Expand Down Expand Up @@ -1183,7 +1169,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .muge = */ params.muge,
/* .rcache = */ params.rcache,
/* .sas = */ params.sas,
/* .fdn = */ params.fdn,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
Expand Down Expand Up @@ -1227,7 +1212,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .muge = */ params.muge,
/* .rcache = */ params.rcache,
/* .sas = */ params.sas,
/* .fdn = */ params.fdn,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
Expand Down Expand Up @@ -1282,7 +1266,6 @@ struct test {
bool muge = false;
bool rcache = false;
bool sas = false;
int fdn = 0;
std::string override_tensor;
int n_prompt;
int n_gen;
Expand Down Expand Up @@ -1324,7 +1307,6 @@ struct test {
ger = inst.ger;
rcache = inst.rcache;
sas = inst.sas;
fdn = inst.fdn;
no_fug = inst.no_fug;
use_thp = inst.use_thp;
no_ooae = inst.no_ooae;
Expand Down Expand Up @@ -1429,7 +1411,7 @@ struct test {
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" || field == "mla_attn" || field == "attn_max_batch" ||
field == "avg_ns" || field == "stddev_ns" || field == "fdn") {
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
Expand Down Expand Up @@ -1480,7 +1462,7 @@ struct test {
std::to_string(mla_attn), std::to_string(attn_max_batch), ser_to_string(ser), std::to_string(reuse),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
std::to_string(repack), std::to_string(mqkv), std::to_string(muge), std::to_string(fmoe), std::to_string(ger),
std::to_string(no_fug), std::to_string(use_thp), std::to_string(no_ooae), std::to_string(rcache), std::to_string(sas), std::to_string(fdn),
std::to_string(no_fug), std::to_string(use_thp), std::to_string(no_ooae), std::to_string(rcache), std::to_string(sas),
cuda_params, override_tensor,
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
Expand All @@ -1501,7 +1483,7 @@ struct test {
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn", "mla_attn", "attn_max_batch", "ser", "reuse",
"tensor_split", "use_mmap", "embeddings", "repack", "mqkv", "muge", "fused_moe", "grouped_er",
"no_fused_up_gate", "use_thp", "no_ooae", "rcache", "sas", "fdn", "cuda_params", "override_tensor",
"no_fused_up_gate", "use_thp", "no_ooae", "rcache", "sas", "cuda_params", "override_tensor",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts", "test",
Expand Down Expand Up @@ -1691,9 +1673,6 @@ struct markdown_printer : public printer {
if (field == "sas") {
return 3;
}
if (field == "fdn") {
return 4;
}
if (field == "use_thp") {
return 3;
}
Expand Down Expand Up @@ -1767,9 +1746,6 @@ struct markdown_printer : public printer {
if (field == "sas") {
return "sas";
}
if (field == "fdn") {
return "fdn";
}
if (field == "use_thp") {
return "thp";
}
Expand Down Expand Up @@ -1880,9 +1856,6 @@ struct markdown_printer : public printer {
if (params.sas != cmd_params_defaults.sas) {
fields.emplace_back("sas");
}
if (params.fdn != cmd_params_defaults.fdn) {
fields.emplace_back("fdn");
}
if (params.muge != cmd_params_defaults.muge) {
fields.emplace_back("muge");
}
Expand Down
1 change: 0 additions & 1 deletion include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,6 @@ extern "C" {
bool split_mode_graph_scheduling; // if true, force split mode graph scheduling
//bool split_mode_f16; // if true, cast intermediate results to f16 before copying to other GPUs
bool scheduler_async; // if true, with split mode "graph" graph evaluation will be done using multiple threads
int fused_delta_net;
bool mtp; // Activate MTP if supported
enum llama_mtp_op_type mtp_op_type;

Expand Down
1 change: 0 additions & 1 deletion src/llama-cparams.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ struct llama_cparams {
bool split_mode_graph_scheduling;
//bool split_mode_f16;
bool scheduler_async;
int fused_delta_net;
int min_experts;
float thresh_experts;
bool mtp;
Expand Down
Loading