1 change: 1 addition & 0 deletions README.md
@@ -137,6 +137,7 @@ Every model is written from scratch to maximize performance and remove layers of
| Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
| Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
| Phi 4 | 14B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2412.08905) |
| Phi 4 Mini Instruct | 3.8B | Microsoft Research | [Microsoft 2025](https://arxiv.org/abs/2503.01743) |
| Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
20 changes: 20 additions & 0 deletions litgpt/config.py
@@ -1601,6 +1601,26 @@ def norm_class(self) -> Type:
mlp_class_name="LLaMAMLP",
parallel_residual=False,
),
# https://huggingface.co/microsoft/Phi-4-mini-instruct/blob/main/config.json
dict(
name="Phi-4-mini-instruct",
hf_config=dict(org="microsoft", name="Phi-4-mini-instruct"),
vocab_size=200019,
padded_vocab_size=200064,
block_size=131072,
n_embd=3072,
n_layer=32,
n_head=24,
n_query_groups=8,
rotary_percentage=0.75,
bias=False,
norm_class_name="RMSNorm",
intermediate_size=8192,
mlp_class_name="LLaMAMLP",
parallel_residual=False,
sliding_window_size=262145,
sliding_window_layer_placing="all",
),
]
configs.extend(phi)

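For orientation, the new entry implies grouped-query attention with a head size of 128 and a fused QKV projection of 5120 rows. A quick standalone sanity-check sketch (not part of the diff) using the values above:

```python
# Standalone sanity check of the attention shapes implied by the
# Phi-4-mini-instruct entry above (not part of the diff).
n_embd, n_head, n_query_groups = 3072, 24, 8

head_size = n_embd // n_head                 # 128
q_rows = n_head * head_size                  # 3072 rows for queries
kv_rows = 2 * n_query_groups * head_size     # 2048 rows for keys + values (GQA)
print(q_rows + kv_rows)                      # 5120 rows in the fused qkv projection
```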
6 changes: 4 additions & 2 deletions litgpt/scripts/convert_hf_checkpoint.py
@@ -323,7 +323,7 @@ def copy_weights_phi(
"lm_head.bias": "lm_head.bias",
}

if config.name.startswith(("Phi-3", "phi-4")):
if config.name.startswith(("Phi-3", "phi-4", "Phi-4")):
weight_map.update(
{
"model.layers.{}.self_attn.qkv_proj.weight": "transformer.h.{}.attn.qkv.weight",
@@ -361,6 +361,9 @@ def copy_weights_phi(
if progress_per_file is not None:
pbar.update(progress_per_file)

if "lm_head.weight" not in state_dict and config.name.startswith("Phi-4"):
state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]

for i in list(qkv_weights):
for weight_type in list(qkv_weights[i]):
qkv = qkv_weights[i][weight_type]
@@ -606,6 +609,5 @@ def convert_hf_checkpoint(
for bin_file in sorted(bin_files):
hf_weights = load_safetensors(bin_file) if bin_file.suffix == ".safetensors" else lazy_load(bin_file)
copy_fn(sd, hf_weights, saver=saver, dtype=dtype, debug_mode=debug_mode)

print(f"Saving converted checkpoint to {checkpoint_dir}")
saver.save(sd)
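The new fallback covers weight tying: the Phi-4-mini-instruct checkpoint ships no separate `lm_head.weight`, so the converter reuses the token-embedding matrix as the output head. A minimal standalone sketch of the same idea, with hypothetical tensor shapes:

```python
import torch

# Minimal sketch of the tied-embedding fallback (hypothetical shapes,
# not the converter itself): if the checkpoint has no lm_head.weight,
# point it at the token-embedding matrix.
state_dict = {"transformer.wte.weight": torch.randn(200064, 3072)}

if "lm_head.weight" not in state_dict:
    state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]

# Both entries now reference the same tensor.
assert state_dict["lm_head.weight"] is state_dict["transformer.wte.weight"]
```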
8 changes: 5 additions & 3 deletions litgpt/scripts/convert_lit_checkpoint.py
@@ -236,7 +236,7 @@ def copy_weights_phi(
"lm_head.weight": "lm_head.weight",
"lm_head.bias": "lm_head.bias",
}
if config.name.startswith(("Phi-3", "phi-4")):
if config.name.lower().startswith(("phi-3", "phi-4")):
weight_map.update(
{
"transformer.h.{}.attn.qkv.weight": "model.layers.{}.self_attn.qkv_proj.weight",
@@ -249,10 +249,12 @@
gate_up_proj_weights = defaultdict(dict)

for from_name, param in lit_weights.items():
if from_name == "lm_head.weight" and config.name.startswith("Phi-4"):
continue
name_template, layer_idx = layer_template(from_name)
param = load_param(param, from_name, None)
if from_name.endswith((".attn.qkv.weight", ".attn.qkv.bias")):
if config.name.startswith("Phi-3"):
if config.name.lower().startswith(("phi-3", "phi-4")):
to_names = (weight_map[name_template].format(layer_idx),)
params = (param,)
else:
@@ -282,7 +284,7 @@ def copy_weights_phi(
param = saver.store_early(param)
state_dict[to_name] = param

if config.name.startswith("Phi-3"):
if config.name.lower().startswith(("phi-3", "phi-4")):
for layer_idx in list(gate_up_proj_weights):
fc_1_weight = gate_up_proj_weights[layer_idx]["fc_1"]
fc_2_weight = gate_up_proj_weights[layer_idx]["fc_2"]
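In the export direction, `lm_head.weight` is skipped for Phi-4 because the Hugging Face checkpoint stores the tied embedding only once, and the Phi-3/Phi-4 path re-fuses LitGPT's two MLP projections into the single `gate_up_proj` matrix the HF format expects. A standalone sketch of that fusion, assuming the stacking order used for Phi-3:

```python
import torch

# Standalone sketch (assumed shapes and concatenation order) of re-fusing
# LitGPT's fc_1 / fc_2 into HF's single gate_up_proj matrix for Phi-3/Phi-4.
intermediate_size, n_embd = 8192, 3072
fc_1 = torch.randn(intermediate_size, n_embd)   # gate projection
fc_2 = torch.randn(intermediate_size, n_embd)   # up projection

gate_up_proj = torch.cat((fc_1, fc_2), dim=0)   # stacked along the output dim
assert gate_up_proj.shape == (2 * intermediate_size, n_embd)
```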
3 changes: 2 additions & 1 deletion tests/test_model.py
@@ -332,7 +332,8 @@ def test_against_hf_phi(model_name, device, dtype):

@torch.inference_mode()
@pytest.mark.parametrize(
"model_name", ("Phi-3-mini-4k-instruct", "Phi-3-mini-128k-instruct", "Phi-3.5-mini-instruct", "phi-4")
"model_name",
("Phi-3-mini-4k-instruct", "Phi-3-mini-128k-instruct", "Phi-3.5-mini-instruct", "phi-4", "Phi-4-mini-instruct"),
)
@pytest.mark.parametrize(
("device", "dtype"),
2 changes: 2 additions & 0 deletions tutorials/download_model_weights.md
@@ -35,6 +35,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
| Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
| Phi 3 & 3.5 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219)
| Phi 4 | 14B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2412.08905) |
| Phi 4 Mini Instruct | 3.8B | Microsoft Research | [Microsoft 2025](https://arxiv.org/abs/2503.01743) |
| Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
@@ -170,6 +171,7 @@ microsoft/Phi-3-mini-128k-instruct
microsoft/Phi-3-mini-4k-instruct
microsoft/Phi-3.5-mini-instruct
microsoft/phi-4
microsoft/Phi-4-mini-instruct
mistralai/mathstral-7B-v0.1
mistralai/Mistral-7B-Instruct-v0.1
mistralai/Mistral-7B-Instruct-v0.2
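With the repo ID listed, the checkpoint should be downloadable and usable like any other supported model; a hedged example assuming LitGPT's documented high-level Python API:

```python
from litgpt import LLM

# Assumes LitGPT's high-level Python API; the checkpoint is fetched from the
# Hugging Face Hub and converted on first use.
llm = LLM.load("microsoft/Phi-4-mini-instruct")
print(llm.generate("What do Llamas eat?", max_new_tokens=32))
```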