
Commit cd88afe

Add phi4 mini
1 parent 3ffd24e commit cd88afe

File tree

8 files changed: +141 -2 lines changed

.ci/scripts/gather_test_models.py

Lines changed: 1 addition & 1 deletion

@@ -90,7 +90,7 @@ def model_should_run_on_event(model: str, event: str) -> bool:
     We put higher priority and fast models to pull request and rest to push.
     """
     if event == "pull_request":
-        return model in ["mv3", "vit"]
+        return model in ["mv3", "vit", "phi4_mini"]  # TODO: remove
     elif event == "push":
         # These are super slow. Only run it periodically
         return model not in ["dl3", "edsr", "emformer_predict"]
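
For illustration, a minimal sketch of what the gating above does: phi4_mini is temporarily pulled into the fast pull_request set, while known-slow models still only run on the periodic schedule. The import path below is an assumption for the sketch (it presumes .ci/scripts is on the Python path), not a documented entry point.

    # Sketch only; assumes gather_test_models.py is importable as a module.
    from gather_test_models import model_should_run_on_event

    assert model_should_run_on_event("phi4_mini", "pull_request")      # added above (TODO: remove)
    assert not model_should_run_on_event("emformer_predict", "push")   # slow models skipped on push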

.ci/scripts/test_model.sh

Lines changed: 8 additions & 0 deletions

@@ -100,6 +100,14 @@ test_model() {
     rm "./${MODEL_NAME}.pte"
     return # Skip running with portable executor runner since portable doesn't support Qwen's biased linears.
   fi
+  if [[ "${MODEL_NAME}" == "phi4_mini" ]]; then
+    # Install requirements for export_llama
+    bash examples/models/llama/install_requirements.sh
+    # Test export_llama script: python3 -m examples.models.llama.export_llama.
+    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/phi_4_mini/config.json
+    run_portable_executor_runner
+    rm "./${MODEL_NAME}.pte"
+  fi

   # Export a basic .pte and run the model.
   "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export --model_name="${MODEL_NAME}" "${STRICT}"

examples/models/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -35,6 +35,7 @@
     "llava": ("llava", "LlavaModel"),
     "efficient_sam": ("efficient_sam", "EfficientSAM"),
     "qwen2_5": ("qwen2_5", "Qwen2_5Model"),
+    "phi4_mini": ("phi4_mini", "Phi4MiniModel"),
 }

 __all__ = [
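
For context, a hedged sketch of how a registry entry like ("phi4_mini", "Phi4MiniModel") is typically consumed: the tuple names a module and a class to import lazily. The loader below is illustrative only, and the package path is an assumption; the repo's actual model factory may differ.

    import importlib

    MODEL_NAME_TO_MODEL = {
        "qwen2_5": ("qwen2_5", "Qwen2_5Model"),
        "phi4_mini": ("phi4_mini", "Phi4MiniModel"),
    }

    def load_model_class(model_name, package="executorch.examples.models"):
        # Resolve (module_name, class_name) into the actual class at call time.
        module_name, class_name = MODEL_NAME_TO_MODEL[model_name]
        module = importlib.import_module(f"{package}.{module_name}")
        return getattr(module, class_name)

    # e.g. Phi4MiniModel = load_model_class("phi4_mini")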

examples/models/llama/export_llama_lib.py

Lines changed: 1 addition & 0 deletions

@@ -93,6 +93,7 @@
     "llama3_2",
     "static_llama",
     "qwen2_5",
+    "phi4_mini",
 ]
 TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"]

examples/models/llama/rope.py

Lines changed: 11 additions & 1 deletion

@@ -134,11 +134,21 @@ def forward(


 # Based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L77
-def hf_precompute_freqs_cis(dim: int, end: int, theta: float):
+# and https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_rope_utils.py#L242.
+# Currently only supports non-long rope.
+def hf_precompute_freqs_cis(
+    dim: int, end: int, theta: float, partial_rotary_factor: float = 1.0
+):
+    # Partial rotary embeddings.
+    dim = int(dim * partial_rotary_factor)
+
+    # Short factor scaling.
     freqs = 1.0 / (
         theta
         ** (torch.arange(0, dim, 2, device="cpu", dtype=torch.int64).float() / dim)
     )
+    # TODO: support long factor scaling.
     # pyre-ignore Undefined attribute [16]: `float` has no attribute `device`.
     t = torch.arange(end, device=freqs.device, dtype=torch.int64).type_as(
         freqs  # pyre-ignore
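
For context, a minimal sketch of what the new partial_rotary_factor parameter means: only the first dim * partial_rotary_factor channels of each head receive rotary embeddings, and the inverse frequencies are computed over that reduced width. The function name, the example values, and the cos/sin return layout below are assumptions based on the Hugging Face references in the diff, not the repo's API.

    import torch

    def precompute_cos_sin_sketch(dim, end, theta, partial_rotary_factor=1.0):
        # Only the first `dim * partial_rotary_factor` channels are rotated.
        rot_dim = int(dim * partial_rotary_factor)
        inv_freq = 1.0 / (
            theta ** (torch.arange(0, rot_dim, 2, dtype=torch.int64).float() / rot_dim)
        )
        t = torch.arange(end, dtype=torch.int64).type_as(inv_freq)
        freqs = torch.outer(t, inv_freq)         # (end, rot_dim // 2)
        emb = torch.cat((freqs, freqs), dim=-1)  # (end, rot_dim), HF half-split layout
        return emb.cos(), emb.sin()

    # e.g. head_dim=128 with partial_rotary_factor=0.75 rotates the first 96 channels (assumed values)
    cos, sin = precompute_cos_sin_sketch(128, 4096, 10000.0, partial_rotary_factor=0.75)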

examples/models/phi_4_mini/config.json

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+{
+  "dim": 3072,
+  "ffn_dim_multiplier": 1,
+  "hidden_dim": 8192,
+  "n_heads": 24,
+  "n_kv_heads": 8,
+  "n_layers": 32,
+  "norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "use_scaled_rope": false,
+  "vocab_size": 200064,
+  "use_hf_rope": true,
+  "attention_qkv_bias": false
+}
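
As a quick sanity check on the params above (plain arithmetic, not a repo API): 24 query heads over a 3072-wide model give a head dimension of 128, and 8 KV heads imply grouped-query attention with 3 query heads per KV head. The path below is the one referenced by the CI change earlier in this commit.

    import json

    with open("examples/models/phi_4_mini/config.json") as f:
        params = json.load(f)

    head_dim = params["dim"] // params["n_heads"]           # 3072 // 24 = 128
    gqa_groups = params["n_heads"] // params["n_kv_heads"]  # 24 // 8 = 3
    print(head_dim, gqa_groups, params["vocab_size"])       # 128 3 200064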

Lines changed: 85 additions & 0 deletions

@@ -0,0 +1,85 @@
+import argparse
+from typing import Dict
+
+import torch
+
+from torchtune.models.convert_weights import get_mapped_key
+
+from torchtune.training import FullModelHFCheckpointer
+
+
+# Standard _FROM_META weight mapping of Meta weights to TorchTune.
+_PHI_4_FROM_META = {
+    "tok_embeddings.weight": "tok_embeddings.weight",
+    "norm.weight": "norm.scale",
+    "layers.{}.attention.wk.weight": "layers.{}.attn.k_proj.weight",
+    "layers.{}.attention.wq.weight": "layers.{}.attn.q_proj.weight",
+    "layers.{}.attention.wv.weight": "layers.{}.attn.v_proj.weight",
+    "layers.{}.attention.wo.weight": "layers.{}.attn.output_proj.weight",
+    "layers.{}.attention_norm.weight": "layers.{}.sa_norm.scale",
+    "layers.{}.ffn_norm.weight": "layers.{}.mlp_norm.scale",
+    "layers.{}.feed_forward.w1.weight": "layers.{}.mlp.w1.weight",
+    "layers.{}.feed_forward.w2.weight": "layers.{}.mlp.w2.weight",
+    "layers.{}.feed_forward.w3.weight": "layers.{}.mlp.w3.weight",
+}
+
+
+def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    """
+    Convert a state dict from torchtune's format to Meta's format. This function
+    doesn't handle any sharding or splitting of state dicts. It follows the
+    state_dict IN -> state_dict OUT pattern.
+
+    Args:
+        state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format.
+
+    Returns:
+        Dict[str, torch.Tensor]: State dict in Meta's format.
+    """
+    converted_state_dict = {}
+    inverted_mapping_dict = {v: k for k, v in _PHI_4_FROM_META.items()}
+
+    for key, value in state_dict.items():
+        new_key = get_mapped_key(key, inverted_mapping_dict)
+        converted_state_dict[new_key] = value
+
+    # Input and output embeddings are tied.
+    converted_state_dict["output.weight"] = converted_state_dict[
+        "tok_embeddings.weight"
+    ]
+
+    return converted_state_dict
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert Phi-4-mini weights to Meta format."
+    )
+    parser.add_argument(
+        "input_dir",
+        type=str,
+        help="Path to directory containing checkpoint files",
+    )
+    parser.add_argument("output", type=str, help="Path to the output checkpoint")
+
+    args = parser.parse_args()
+
+    checkpointer = FullModelHFCheckpointer(
+        checkpoint_dir=args.input_dir,
+        checkpoint_files=["model-00001-of-00003.safetensors", "model-00002-of-00003.safetensors", "model-00003-of-00003.safetensors"],
+        output_dir=".",
+        model_type="PHI4_MINI",
+    )
+
+    print("Loading checkpoint...")
+    sd = checkpointer.load_checkpoint()
+
+    print("Converting checkpoint...")
+    sd = phi_4_tune_to_meta(sd["model"])
+
+    torch.save(sd, args.output)
+    print(f"Checkpoint saved to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
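
A possible invocation of the conversion script above, followed by a small check that the output is a flat Meta-format state dict with tied embeddings. The script filename and checkpoint paths are placeholders, not taken from the commit.

    # Hypothetical usage (names are placeholders):
    #   python convert_phi4_weights.py ./Phi-4-mini-checkpoint phi4_mini_meta.pth
    import torch

    sd = torch.load("phi4_mini_meta.pth")  # dict written by torch.save(sd, args.output)
    assert torch.equal(sd["output.weight"], sd["tok_embeddings.weight"])  # tied embeddings
    print(f"{len(sd)} tensors, e.g. {sorted(sd)[0]}")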

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+
+set_seed(2024)
+
+prompt = "Tell me a story."
+
+model_checkpoint = "microsoft/Phi-4-multimodal-instruct"
+
+tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_checkpoint,
+                                             trust_remote_code=True,
+                                             torch_dtype="auto",
+                                             device_map="cpu")
+
+inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
+outputs = model.generate(**inputs, do_sample=True, max_new_tokens=120)
+response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+print(response)
