diff --git a/README.md b/README.md
index d2e0494443..7d208e8cf0 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
 | Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
 | Phi 4 | 14B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2412.08905) |
+| Phi 4 Mini Instruct | 3.8B | Microsoft Research | [Microsoft 2025](https://arxiv.org/abs/2503.01743) |
 | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
diff --git a/litgpt/config.py b/litgpt/config.py
index f36bb11a7a..f1384830e1 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -1601,6 +1601,26 @@ def norm_class(self) -> Type:
         mlp_class_name="LLaMAMLP",
         parallel_residual=False,
     ),
+    # https://huggingface.co/microsoft/Phi-4-mini-instruct/blob/main/config.json
+    dict(
+        name="Phi-4-mini-instruct",
+        hf_config=dict(org="microsoft", name="Phi-4-mini-instruct"),
+        vocab_size=200019,
+        padded_vocab_size=200064,
+        block_size=131072,
+        n_embd=3072,
+        n_layer=32,
+        n_head=24,
+        n_query_groups=8,
+        rotary_percentage=0.75,
+        bias=False,
+        norm_class_name="RMSNorm",
+        intermediate_size=8192,
+        mlp_class_name="LLaMAMLP",
+        parallel_residual=False,
+        sliding_window_size=262145,
+        sliding_window_layer_placing="all",
+    ),
 ]
 
 configs.extend(phi)
diff --git a/litgpt/scripts/convert_hf_checkpoint.py b/litgpt/scripts/convert_hf_checkpoint.py
index 0261807a79..c6dc110f20 100644
--- a/litgpt/scripts/convert_hf_checkpoint.py
+++ b/litgpt/scripts/convert_hf_checkpoint.py
@@ -323,7 +323,7 @@ def copy_weights_phi(
         "lm_head.bias": "lm_head.bias",
     }
 
-    if config.name.startswith(("Phi-3", "phi-4")):
+    if config.name.startswith(("Phi-3", "phi-4", "Phi-4")):
         weight_map.update(
             {
                 "model.layers.{}.self_attn.qkv_proj.weight": "transformer.h.{}.attn.qkv.weight",
@@ -361,6 +361,9 @@ def copy_weights_phi(
         if progress_per_file is not None:
             pbar.update(progress_per_file)
 
+    if "lm_head.weight" not in state_dict and config.name.startswith("Phi-4"):
+        state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]
+
     for i in list(qkv_weights):
         for weight_type in list(qkv_weights[i]):
             qkv = qkv_weights[i][weight_type]
@@ -606,6 +609,5 @@ def convert_hf_checkpoint(
         for bin_file in sorted(bin_files):
             hf_weights = load_safetensors(bin_file) if bin_file.suffix == ".safetensors" else lazy_load(bin_file)
             copy_fn(sd, hf_weights, saver=saver, dtype=dtype, debug_mode=debug_mode)
-        print(f"Saving converted checkpoint to {checkpoint_dir}")
         saver.save(sd)
diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py
index 934d87fa6d..75c6934f56 100644
--- a/litgpt/scripts/convert_lit_checkpoint.py
+++ b/litgpt/scripts/convert_lit_checkpoint.py
@@ -236,7 +236,7 @@ def copy_weights_phi(
         "lm_head.weight": "lm_head.weight",
         "lm_head.bias": "lm_head.bias",
     }
-    if config.name.startswith(("Phi-3", "phi-4")):
+    if config.name.lower().startswith(("phi-3", "phi-4")):
         weight_map.update(
             {
                 "transformer.h.{}.attn.qkv.weight": "model.layers.{}.self_attn.qkv_proj.weight",
@@ -249,10 +249,12 @@ def copy_weights_phi(
     gate_up_proj_weights = defaultdict(dict)
 
     for from_name, param in lit_weights.items():
+        if from_name == "lm_head.weight" and config.name.startswith("Phi-4"):
+            continue
         name_template, layer_idx = layer_template(from_name)
         param = load_param(param, from_name, None)
         if from_name.endswith((".attn.qkv.weight", ".attn.qkv.bias")):
-            if config.name.startswith("Phi-3"):
+            if config.name.lower().startswith(("phi-3", "phi-4")):
                 to_names = (weight_map[name_template].format(layer_idx),)
                 params = (param,)
             else:
@@ -282,7 +284,7 @@ def copy_weights_phi(
                 param = saver.store_early(param)
             state_dict[to_name] = param
 
-    if config.name.startswith("Phi-3"):
+    if config.name.lower().startswith(("phi-3", "phi-4")):
         for layer_idx in list(gate_up_proj_weights):
             fc_1_weight = gate_up_proj_weights[layer_idx]["fc_1"]
             fc_2_weight = gate_up_proj_weights[layer_idx]["fc_2"]
diff --git a/tests/test_model.py b/tests/test_model.py
index d0babac8f8..85ed72c516 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -332,7 +332,8 @@ def test_against_hf_phi(model_name, device, dtype):
 
 @torch.inference_mode()
 @pytest.mark.parametrize(
-    "model_name", ("Phi-3-mini-4k-instruct", "Phi-3-mini-128k-instruct", "Phi-3.5-mini-instruct", "phi-4")
+    "model_name",
+    ("Phi-3-mini-4k-instruct", "Phi-3-mini-128k-instruct", "Phi-3.5-mini-instruct", "phi-4", "Phi-4-mini-instruct"),
 )
 @pytest.mark.parametrize(
     ("device", "dtype"),
diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md
index 40335d949a..9b7011804e 100644
--- a/tutorials/download_model_weights.md
+++ b/tutorials/download_model_weights.md
@@ -35,6 +35,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
 | Phi 3 & 3.5 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
 | Phi 4 | 14B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2412.08905) |
+| Phi 4 Mini Instruct | 3.8B | Microsoft Research | [Microsoft 2025](https://arxiv.org/abs/2503.01743) |
 | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
@@ -170,6 +171,7 @@ microsoft/Phi-3-mini-128k-instruct
 microsoft/Phi-3-mini-4k-instruct
 microsoft/Phi-3.5-mini-instruct
 microsoft/phi-4
+microsoft/Phi-4-mini-instruct
 mistralai/mathstral-7B-v0.1
 mistralai/Mistral-7B-Instruct-v0.1
 mistralai/Mistral-7B-Instruct-v0.2
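
As a quick sanity check of the new registry entry (a minimal sketch, not part of the patch; it assumes litgpt is installed with this change applied and that `Config.from_name` in litgpt/config.py keeps its current behavior), the config can be looked up by name and a few of the values registered above printed:

# Sketch only: verify the "Phi-4-mini-instruct" entry resolves and carries the expected fields.
from litgpt.config import Config

config = Config.from_name("Phi-4-mini-instruct")
print(config.n_layer, config.n_head, config.n_query_groups)  # expected: 32 24 8
print(config.padded_vocab_size, config.rotary_percentage)    # expected: 200064 0.75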