Commit 2a49a68

Merge branch 'master' into compilade/faster-lazy-safetensors

2 parents: 7cda4dd + 97bdd26

25 files changed: +1514 −703 lines

common/common.cpp

Lines changed: 8 additions & 10 deletions
@@ -685,15 +685,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     if (arg == "--lora") {
        CHECK_ARG
        params.lora_adapter.emplace_back(argv[i], 1.0f);
-       params.use_mmap = false;
        return true;
    }
    if (arg == "--lora-scaled") {
        CHECK_ARG
        const char* lora_adapter = argv[i];
        CHECK_ARG
        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
-       params.use_mmap = false;
        return true;
    }
    if (arg == "--lora-base") {

@@ -797,6 +795,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.cont_batching = true;
        return true;
    }
+   if (arg == "-nocb" || arg == "--no-cont-batching") {
+       params.cont_batching = false;
+       return true;
+   }
    if (arg == "-fa" || arg == "--flash-attn") {
        params.flash_attn = true;
        return true;

@@ -1538,6 +1540,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
    options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
    options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
+   options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });

    options.push_back({ "multi-modality" });
    options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });

@@ -2084,19 +2087,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
        float lora_scale = std::get<1>(params.lora_adapter[i]);
-       int err = llama_model_apply_lora_from_file(model,
-                                                  lora_adapter.c_str(),
-                                                  lora_scale,
-                                                  ((i > 0) || params.lora_base.empty())
-                                                  ? NULL
-                                                  : params.lora_base.c_str(),
-                                                  params.n_threads);
-       if (err != 0) {
+       auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+       if (adapter == nullptr) {
            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
            llama_free(lctx);
            llama_free_model(model);
            return std::make_tuple(nullptr, nullptr);
        }
+       llama_lora_adapter_set(lctx, adapter, lora_scale);
    }

    if (params.ignore_eos) {

convert_hf_to_gguf.py

Lines changed: 12 additions & 22 deletions
@@ -2271,13 +2271,6 @@ def set_vocab(self):

        special_vocab.add_to_gguf(self.gguf_writer)

-   def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
-       if n_head_kv is not None and n_head != n_head_kv:
-           n_head = n_head_kv
-       return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-               .swapaxes(1, 2)
-               .reshape(weights.shape))
-
    def set_gguf_parameters(self):
        self.gguf_writer.add_name("InternLM2")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
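Note on the deleted helper: `_hf_permute_qk` duplicated the `LlamaModel.permute` static method, which the rewritten `modify_tensors` in the next hunk now calls instead, and which (as far as this commit shows) has the same body. As a rough standalone sketch of what that permutation does, using the body shown in the deleted lines above; the NumPy import and the toy 4-row weight are illustrative only:

import numpy as np

def permute(weights, n_head: int, n_head_kv: int | None):
    # Same body as the removed _hf_permute_qk: within each attention head,
    # reorder rows between the two rotary-embedding layouts
    # (HF's half-split ordering vs. interleaved pairs).
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

# Toy example: one head, head_dim = 4; rows [0, 1, 2, 3] become [0, 2, 1, 3].
w = np.arange(4).reshape(4, 1)
print(permute(w, 1, 1).ravel())  # -> [0 2 1 3]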
@@ -2297,26 +2290,22 @@ def set_gguf_parameters(self):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        num_heads = self.hparams["num_attention_heads"]
        num_kv_heads = self.hparams["num_key_value_heads"]
-       hidden_size = self.hparams["hidden_size"]
+       n_embd = self.hparams["hidden_size"]
        q_per_kv = num_heads // num_kv_heads
-       head_dim = hidden_size // num_heads
+       head_dim = n_embd // num_heads
        num_groups = num_heads // q_per_kv

-       qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
-
-       if re.match(qkv_pattern, name):
-           bid = re.findall(qkv_pattern, name)[0]
+       if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
            qkv = data_torch
-           # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
-           qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
-           q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
+
+           qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
+           q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
+
            # The model weights of q and k equire additional reshape.
-           # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
-           q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads)
-           # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
-           k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads)
-           # v = rearrange(v, " o g n i -> o (g n i)").T
-           v = v.reshape((v.shape[0], -1)).T
+           q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
+           k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
+           v = v.reshape((-1, v.shape[-1]))
+
            return [
                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
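For context on the rewritten `modify_tensors`: InternLM2 packs its attention weights into a single `wqkv` tensor in which each of the `num_groups` KV groups stores `q_per_kv` query heads followed by one key head and one value head, each `head_dim` rows tall. A minimal sketch with made-up sizes (the numbers below are hypothetical, not from any real InternLM2 config) showing that the reshape-and-slice from the diff recovers the expected per-tensor shapes; the final `LlamaModel.permute` step is omitted here:

import torch

# Hypothetical InternLM2-like sizes, not taken from any real config.
num_heads, num_kv_heads, n_embd = 8, 2, 64
q_per_kv = num_heads // num_kv_heads   # 4 query heads per KV head
head_dim = n_embd // num_heads         # 8
num_groups = num_heads // q_per_kv     # 2 (== num_kv_heads)

# Packed wqkv: each group stores q_per_kv query heads, then 1 key, then 1 value.
rows = num_groups * (q_per_kv + 2) * head_dim
wqkv = torch.randn(rows, n_embd)

qkv = wqkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1]

# Flatten the group/head axes back into the row dimension, as the converter does.
q = q.reshape((-1, q.shape[-1]))  # (num_heads * head_dim, n_embd)    == (64, 64)
k = k.reshape((-1, k.shape[-1]))  # (num_kv_heads * head_dim, n_embd) == (16, 64)
v = v.reshape((-1, v.shape[-1]))  # (num_kv_heads * head_dim, n_embd) == (16, 64)
print(q.shape, k.shape, v.shape)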
@@ -3620,6 +3609,7 @@ def main() -> None:
                                     small_first_shard=args.no_tensor_first_split)

    logger.info("Set model parameters")
+   model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL)
    model_instance.set_gguf_parameters()

    logger.info("Set model tokenizer")