Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ggml/include/gguf.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ extern "C" {
};

GGML_API struct gguf_context * gguf_init_empty(void);
GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
//GGML_API struct gguf_context * gguf_init_from_buffer(..);

Expand Down Expand Up @@ -189,6 +190,7 @@ extern "C" {
//

// write the entire context to a binary file
GGML_API bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta);
GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);

// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
Expand Down
1 change: 0 additions & 1 deletion ggml/src/ggml-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,5 @@ inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,

// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
#endif // __cplusplus
33 changes: 23 additions & 10 deletions ggml/src/gguf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,11 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
return true;
}

struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
if (!file) {
return nullptr;
}

const struct gguf_reader gr(file);
struct gguf_context * ctx = new gguf_context;

Expand Down Expand Up @@ -848,7 +852,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
return nullptr;
}

struct gguf_context * result = gguf_init_from_file_impl(file, params);
struct gguf_context * result = gguf_init_from_file_ptr(file, params);
fclose(file);
return result;
}
Expand Down Expand Up @@ -1508,6 +1512,19 @@ void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & bu
gguf_write_out(ctx, gw, only_meta);
}

bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta) {
GGML_ASSERT(file);

try {
gguf_writer_file gw(file);
gguf_write_out(ctx, gw, only_meta);
} catch (const std::runtime_error& ex) {
GGML_LOG_ERROR("%s: failed to write GGUF data: %s\n", __func__, ex.what());
return false;
}
return true;
}

bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
FILE * file = ggml_fopen(fname, "wb");

Expand All @@ -1516,17 +1533,13 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo
return false;
}

try {
gguf_writer_file gw(file);
gguf_write_out(ctx, gw, only_meta);
} catch (const std::runtime_error& ex) {
GGML_LOG_ERROR("%s: failed to write GGUF data into '%s': %s\n", __func__, fname, ex.what());
fclose(file);
return false;
const bool success = gguf_write_to_file_ptr(ctx, file, only_meta);
if (!success) {
GGML_LOG_ERROR("%s: failed to write GGUF data into '%s'\n", __func__, fname);
}

fclose(file);
return true;
return success;
}

size_t gguf_get_meta_size(const struct gguf_context * ctx) {
Expand Down
5 changes: 5 additions & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,11 @@ extern "C" {
const char * path_model,
struct llama_model_params params);

// Load a model from an open FILE pointer
LLAMA_API struct llama_model * llama_model_load_from_file_ptr(
FILE * file,
struct llama_model_params params);

// Load a model from multiple splits (support custom naming scheme)
// The paths must be in the correct order
LLAMA_API struct llama_model * llama_model_load_from_splits(
Expand Down
61 changes: 16 additions & 45 deletions src/llama-arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,10 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
case LLM_ARCH_CLIP:
return {};
case LLM_ARCH_LLAMA:
case LLM_ARCH_REFACT:
case LLM_ARCH_MINICPM:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_DECI:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
Expand Down Expand Up @@ -744,11 +748,9 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
};
case LLM_ARCH_REFACT:
case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2VL:
case LLM_ARCH_INTERNLM2:
case LLM_ARCH_GRANITE:
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_PADDLEOCR:
case LLM_ARCH_SMOLLM3:
Expand All @@ -759,6 +761,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
Expand Down Expand Up @@ -1232,29 +1235,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_MINICPM:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ROPE_FACTORS_LONG,
LLM_TENSOR_ROPE_FACTORS_SHORT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_GATE_EXP,
LLM_TENSOR_FFN_DOWN_EXP,
LLM_TENSOR_FFN_UP_EXP,
};
case LLM_ARCH_MINICPM3:
return {
LLM_TENSOR_TOKEN_EMBD,
Expand Down Expand Up @@ -1442,6 +1422,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
Expand Down Expand Up @@ -1657,7 +1638,9 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
Expand Down Expand Up @@ -2061,30 +2044,12 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_GRANITE_MOE:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
};
case LLM_ARCH_GRANITE_HYBRID:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_SSM_IN,
LLM_TENSOR_SSM_CONV1D,
Expand Down Expand Up @@ -2412,6 +2377,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_OUT,
Expand Down Expand Up @@ -2789,7 +2755,12 @@ std::string LLM_TN_IMPL::str() const {
}

if (model_tensors.find(tensor) == model_tensors.end()) {
return LLM_TENSOR_NAMES.at(tensor);
const char * name = LLM_TENSOR_NAMES.at(tensor);
if (suffix != nullptr || bid != -1 || xid != -1) {
LLAMA_LOG_WARN("%s: cannot properly format tensor name %s with suffix=%s bid=%d xid=%d\n",
__func__, name, suffix, bid, xid);
}
return name;
}

std::string name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid);
Expand Down
23 changes: 21 additions & 2 deletions src/llama-mmap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,14 @@ struct llama_file::impl {
seek(0, SEEK_SET);
}

impl(FILE * file) : owns_fp(false) {
fp = file;
fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}

size_t tell() const {
LARGE_INTEGER li;
li.QuadPart = 0;
Expand Down Expand Up @@ -159,7 +167,7 @@ struct llama_file::impl {
}

~impl() {
if (fp) {
if (fp && owns_fp) {
std::fclose(fp);
}
}
Expand Down Expand Up @@ -209,6 +217,13 @@ struct llama_file::impl {
seek(0, SEEK_SET);
}

impl(FILE * file) : fname("(file*)"), owns_fp(false) {
fp = file;
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}

size_t tell() const {
if (fd == -1) {
long ret = std::ftell(fp);
Expand Down Expand Up @@ -353,7 +368,7 @@ struct llama_file::impl {
~impl() {
if (fd != -1) {
close(fd);
} else {
} else if (owns_fp) {
std::fclose(fp);
}
}
Expand All @@ -369,10 +384,14 @@ struct llama_file::impl {

FILE * fp{};
size_t size{};
bool owns_fp = true;
};

llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}

llama_file::llama_file(FILE * file) : pimpl(std::make_unique<impl>(file)) {}

llama_file::~llama_file() = default;

size_t llama_file::tell() const { return pimpl->tell(); }
Expand Down
1 change: 1 addition & 0 deletions src/llama-mmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

struct llama_file {
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
llama_file(FILE * file);
~llama_file();

size_t tell() const;
Expand Down
33 changes: 32 additions & 1 deletion src/llama-model-loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ llama_model_loader::llama_model_loader(
void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits,
FILE * file,
bool use_mmap,
bool use_direct_io,
bool check_tensors,
Expand Down Expand Up @@ -658,6 +659,36 @@ llama_model_loader::llama_model_loader(

LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
}
} else if (file != nullptr) {
struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};

metadata_ptr.reset(gguf_init_from_file_ptr(file, params));
metadata = metadata_ptr.get();
if (metadata == nullptr) {
throw std::runtime_error(format("%s: failed to load model from file pointer", __func__));
}

get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));

files.emplace_back(new llama_file(file));
contexts.emplace_back(ctx);

// Save tensors data offset info of the main file.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string tensor_name = std::string(cur->name);
// make sure there is no duplicated tensor names
if (weights_map.find(tensor_name) != weights_map.end()) {
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
}
n_elements += ggml_nelements(cur);
n_bytes += ggml_nbytes(cur);
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
}
} else {
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
Expand All @@ -669,7 +700,7 @@ llama_model_loader::llama_model_loader(
fver = (enum llama_fver) gguf_get_version(metadata);

LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
__func__, n_kv, n_tensors, fname.empty() ? "(file*)" : fname.c_str(), llama_file_version_name(fver));

// determine file type based on the number of tensors for each quantization and print meta data
// TODO: make optional
Expand Down
1 change: 1 addition & 0 deletions src/llama-model-loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ struct llama_model_loader {
void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
FILE * file,
bool use_mmap,
bool use_direct_io,
bool check_tensors,
Expand Down
Loading
Loading