Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1533,7 +1533,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "-fdn" || arg == "--fused-delta-net") {
CHECK_ARG
params.fused_delta_net = std::stoi(argv[i]);
fprintf(stderr, "=================== %s has been deprecated\n", arg.c_str());
return true;
}
if (arg == "-smf16" || arg == "--split-mode-f16") {
Expand Down Expand Up @@ -2276,7 +2276,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-grt, --graph-reduce-type", "Type for data exchange between GPUs (default: %s)", "f32"});
options.push_back({ "*", "-smgs, --split-mode-graph-scheduling,", "Force Split Mode Graph Scheduling (default: %d)", params.split_mode_graph_scheduling});
options.push_back({ "*", "-sas, --scheduler_async,", "Async evaluation of compute graphs: %d)", params.scheduler_async});
options.push_back({ "*", "-fdn, --fused-delta-net N", "Use fused delta-net when batch size is <= N with recurrent models: %d)", params.fused_delta_net});
options.push_back({ "*", "-vq, --validate-quants", "validate quantized data while loading the model (default: %d)", params.validate_quants});
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
"in conversation mode, this will be used as system prompt\n"
Expand Down Expand Up @@ -3355,7 +3354,6 @@ struct llama_context_params common_context_params_to_llama(const gpt_params & pa
cparams.split_mode_graph_scheduling = params.split_mode_graph_scheduling;
//cparams.split_mode_f16 = params.split_mode_f16;
cparams.scheduler_async = params.scheduler_async;
cparams.fused_delta_net = params.fused_delta_net;
cparams.min_experts = params.min_experts;
cparams.thresh_experts = params.thresh_experts;
cparams.only_active_experts = params.only_active_exps;
Expand Down Expand Up @@ -4366,7 +4364,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
//fprintf(stream, "split_mode_f16: %s # default: true\n", params.split_mode_f16 ? "true" : "false");
fprintf(stream, "reduce_type: %s # default f16\n", params.reduce_type.c_str());
fprintf(stream, "scheduler_async: %s # default: false\n", params.scheduler_async ? "true" : "false");
fprintf(stream, "fused_delta_net: %d # default: 0\n", params.fused_delta_net );
fprintf(stream, "ser: %d,%g # defaulr: -1,0\n", params.min_experts, params.thresh_experts);
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);

Expand Down
31 changes: 3 additions & 28 deletions examples/llama-bench/llama-bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,6 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -no-fug, --no-fused-up-gate <0|1> (default: %s)\n", cmd_params_defaults.no_fug? "1" : "0");
printf(" -no-ooae, --no-offload-only-active-experts <0|1> (default: %s)\n", cmd_params_defaults.no_ooae? "1" : "0");
printf(" -sas, --scheduler-async <0|1> (default: %s)\n", cmd_params_defaults.sas ? "1" : "0");
printf(" -fdn, --fused-delta-net <n> (default: %d)\n", cmd_params_defaults.fdn);
printf(" --print-overrides <0|1> (default: %s)\n", cmd_params_defaults.print_overrides ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
Expand Down Expand Up @@ -813,12 +812,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
params.sas = std::stoi(argv[i]);
} else if (arg == "-fdn" || arg == "--fused-delta-net") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.fdn = std::stoi(argv[i]);
} else if (arg == "-rcache" || arg == "--rope-cache") {
if (++i >= argc) {
invalid_param = true;
Expand Down Expand Up @@ -965,7 +958,6 @@ struct cmd_params_instance {
bool muge = false;
bool rcache = false;
bool sas = false;
int fdn = 0;
const llama_model_tensor_buft_override* buft_overrides;

llama_model_params to_llama_mparams() const {
Expand Down Expand Up @@ -1001,7 +993,6 @@ struct cmd_params_instance {
muge == other.muge &&
use_thp == other.use_thp &&
sas == other.sas &&
fdn == other.fdn &&
tensor_split == other.tensor_split;
}

Expand All @@ -1028,7 +1019,6 @@ struct cmd_params_instance {
cparams.embeddings = embeddings;
cparams.cuda_params = (void *)cuda_params.data();
cparams.scheduler_async = sas;
cparams.fused_delta_net = fdn;

return cparams;
}
Expand Down Expand Up @@ -1095,7 +1085,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .muge = */ params.muge,
/* .rcache = */ params.rcache,
/* .sas = */ params.sas,
/* .fdn = */ params.fdn,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
Expand Down Expand Up @@ -1139,7 +1128,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .muge = */ params.muge,
/* .rcache = */ params.rcache,
/* .sas = */ params.sas,
/* .fdn = */ params.fdn,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
Expand Down Expand Up @@ -1183,7 +1171,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .muge = */ params.muge,
/* .rcache = */ params.rcache,
/* .sas = */ params.sas,
/* .fdn = */ params.fdn,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
Expand Down Expand Up @@ -1227,7 +1214,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .muge = */ params.muge,
/* .rcache = */ params.rcache,
/* .sas = */ params.sas,
/* .fdn = */ params.fdn,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
Expand Down Expand Up @@ -1282,7 +1268,6 @@ struct test {
bool muge = false;
bool rcache = false;
bool sas = false;
int fdn = 0;
std::string override_tensor;
int n_prompt;
int n_gen;
Expand Down Expand Up @@ -1324,7 +1309,6 @@ struct test {
ger = inst.ger;
rcache = inst.rcache;
sas = inst.sas;
fdn = inst.fdn;
no_fug = inst.no_fug;
use_thp = inst.use_thp;
no_ooae = inst.no_ooae;
Expand Down Expand Up @@ -1429,7 +1413,7 @@ struct test {
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" || field == "mla_attn" || field == "attn_max_batch" ||
field == "avg_ns" || field == "stddev_ns" || field == "fdn") {
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
Expand Down Expand Up @@ -1480,7 +1464,7 @@ struct test {
std::to_string(mla_attn), std::to_string(attn_max_batch), ser_to_string(ser), std::to_string(reuse),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
std::to_string(repack), std::to_string(mqkv), std::to_string(muge), std::to_string(fmoe), std::to_string(ger),
std::to_string(no_fug), std::to_string(use_thp), std::to_string(no_ooae), std::to_string(rcache), std::to_string(sas), std::to_string(fdn),
std::to_string(no_fug), std::to_string(use_thp), std::to_string(no_ooae), std::to_string(rcache), std::to_string(sas),
cuda_params, override_tensor,
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
Expand All @@ -1501,7 +1485,7 @@ struct test {
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn", "mla_attn", "attn_max_batch", "ser", "reuse",
"tensor_split", "use_mmap", "embeddings", "repack", "mqkv", "muge", "fused_moe", "grouped_er",
"no_fused_up_gate", "use_thp", "no_ooae", "rcache", "sas", "fdn", "cuda_params", "override_tensor",
"no_fused_up_gate", "use_thp", "no_ooae", "rcache", "sas", "cuda_params", "override_tensor",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts", "test",
Expand Down Expand Up @@ -1691,9 +1675,6 @@ struct markdown_printer : public printer {
if (field == "sas") {
return 3;
}
if (field == "fdn") {
return 4;
}
if (field == "use_thp") {
return 3;
}
Expand Down Expand Up @@ -1767,9 +1748,6 @@ struct markdown_printer : public printer {
if (field == "sas") {
return "sas";
}
if (field == "fdn") {
return "fdn";
}
if (field == "use_thp") {
return "thp";
}
Expand Down Expand Up @@ -1880,9 +1858,6 @@ struct markdown_printer : public printer {
if (params.sas != cmd_params_defaults.sas) {
fields.emplace_back("sas");
}
if (params.fdn != cmd_params_defaults.fdn) {
fields.emplace_back("fdn");
}
if (params.muge != cmd_params_defaults.muge) {
fields.emplace_back("muge");
}
Expand Down
Loading