Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions examples/gguf-split/gguf-split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <fstream>
#include <string>
#include <vector>
#include <filesystem>

#include <stdio.h>
#include <string.h>
Expand Down Expand Up @@ -190,6 +191,18 @@ static void zeros(std::ofstream & file, size_t n) {
}
}

// Create the directory component of `filepath` (and any missing ancestors)
// so a subsequent file open cannot fail for lack of a parent directory.
// Paths with no directory component are left untouched. Exits the process
// on filesystem errors, reporting the failure on stderr.
static void ensure_output_directory(const std::string & filepath) {
    const std::filesystem::path out_path(filepath);
    if (!out_path.has_parent_path()) {
        return; // bare filename: nothing to create
    }
    const std::filesystem::path parent = out_path.parent_path();
    std::error_code ec;
    // error_code overload: does not throw; an already-existing directory is not an error
    std::filesystem::create_directories(parent, ec);
    if (ec) {
        fprintf(stderr, "Failed to create directory '%s': %s\n", parent.string().c_str(), ec.message().c_str());
        exit(EXIT_FAILURE);
    }
}

struct split_strategy {
const split_params params;
std::ifstream & f_input;
Expand Down Expand Up @@ -310,6 +323,8 @@ struct split_strategy {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);

ensure_output_directory(split_path);

// open the output file
printf("Writing file %s ... ", split_path);
fflush(stdout);
Expand Down Expand Up @@ -401,6 +416,8 @@ static void gguf_merge(const split_params & split_params) {
int n_split = 1;
int total_tensors = 0;

ensure_output_directory(split_params.output);

// avoid overwriting existing output file
if (std::ifstream(split_params.output.c_str())) {
fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
Expand Down
5 changes: 4 additions & 1 deletion examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
Expand All @@ -175,6 +175,7 @@ static void usage(const char * executable) {
printf(" --ffn-down-type ggml_type: use this ggml_type for the ffn_down tensor.\n");
printf(" --ffn-up-type ggml_type: use this ggml_type for the ffn_up tensor.\n\n");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
printf(" --partial-requant: quantize only missing split files in the split quantized .gguf destination directory\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
Expand Down Expand Up @@ -466,6 +467,8 @@ int main(int argc, char ** argv) {
}
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
params.keep_split = true;
} else if (strcmp(argv[arg_idx], "--partial-requant") == 0) {
params.partial_requant = true;
} else {
usage(argv[0]);
}
Expand Down
1 change: 1 addition & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ extern "C" {
bool ignore_imatrix_rules; // If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored
bool only_repack; // Only repack tensors
bool dry_run; //
bool partial_requant; // quantize only missing split files in the split quantized .gguf destination directory
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
void * custom_quants; // pointer to vector containing custom quantization rules
Expand Down
47 changes: 46 additions & 1 deletion src/llama-quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <regex>
#include <mutex>
#include <fstream>
#include <filesystem>

//
// quantization
Expand Down Expand Up @@ -39,6 +40,18 @@ static void zeros(std::ofstream & file, size_t n) {
}
}

// Make sure the directory that will hold `filepath` exists, creating every
// missing intermediate directory. A path without a parent component needs no
// work. On failure the error is printed to stderr and the process terminates.
static void ensure_output_directory(const std::string & filepath) {
    std::filesystem::path target(filepath);
    if (target.has_parent_path()) {
        const auto dir = target.parent_path();
        std::error_code ec;
        std::filesystem::create_directories(dir, ec); // non-throwing overload; pre-existing dir sets no error
        if (ec) {
            fprintf(stderr, "Failed to create directory '%s': %s\n", dir.string().c_str(), ec.message().c_str());
            exit(EXIT_FAILURE);
        }
    }
}

struct quantize_state_internal {
const llama_model & model;
const llama_model_quantize_params * params;
Expand Down Expand Up @@ -1039,8 +1052,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}

const size_t align = GGUF_DEFAULT_ALIGNMENT;

ensure_output_directory(fname_out);

struct gguf_context * ctx_out = gguf_init_empty();

// Early exit if partial_requant is enabled and output file already exists
if (params->partial_requant && !params->keep_split) {
std::ifstream test_file(fname_out);
if (test_file) {
LLAMA_LOG_INFO("%s: output file %s exists, skipping\n", __func__, fname_out.c_str());
gguf_free(ctx_out);
return;
}
}

// copy the KV pairs from the input file
gguf_set_kv (ctx_out, ml.meta);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
Expand Down Expand Up @@ -1179,6 +1205,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

int cur_split = -1;
std::ofstream fout;
std::vector<bool> split_skipped(n_split, false);
auto close_ofstream = [&]() {
// Write metadata and close file handler
if (fout.is_open()) {
Expand All @@ -1202,6 +1229,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
fname = std::string(split_path);
}

if (params->partial_requant) {
std::ifstream test_file(fname);
if (test_file) {
LLAMA_LOG_INFO("%s: split file %s exists, skipping\n", __func__, fname.c_str());
split_skipped[cur_split] = true;
fout = std::ofstream();
return;
}
}

ensure_output_directory(fname);
fout = std::ofstream(fname, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
Expand All @@ -1219,6 +1257,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_ofstream(weight->idx);
}

if (params->partial_requant && split_skipped[cur_split]) {
const std::string name = ggml_get_name(tensor);
gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), tensor->type);
gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), tensor->data, ggml_nbytes(tensor));
continue;
}

const std::string name = ggml_get_name(tensor);

if (!ml.use_mmap) {
Expand Down Expand Up @@ -1511,7 +1556,7 @@ QuantizationDone:;
total_size_org += ggml_nbytes(tensor);
total_size_new += new_size;

if (!params->dry_run) {
if (!params->dry_run && !split_skipped[cur_split]) {
// update the gguf meta data as we go
gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
Expand Down
1 change: 1 addition & 0 deletions src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4414,6 +4414,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.ignore_imatrix_rules =*/ false,
/*.only_repack =*/ false,
/*.dry_run =*/ false,
/*.partial_requant =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.custom_quants =*/ nullptr,
Expand Down