Skip to content

Commit 03a0962

Browse files
orionrmalfet
authored and committed
[Llama3] Support Llama3 download from Hugging Face (pytorch#323)
1 parent 99c2f4b commit 03a0962

File tree

4 files changed

+34
-10
lines changed

4 files changed

+34
-10
lines changed

build/convert_hf_checkpoint.py

+21-2
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,26 @@ def convert_hf_checkpoint(
3838
# Load the json file containing weight mapping
3939
model_map_json = model_dir / "pytorch_model.bin.index.json"
4040

41-
assert model_map_json.is_file()
41+
# If there is no weight mapping, check for a consolidated model and
42+
# tokenizer we can move. Llama 2 and Mistral have weight mappings, while
43+
# Llama 3 has a consolidated model and tokenizer.
44+
# Otherwise raise an error.
45+
if not model_map_json.is_file():
46+
consolidated_pth = model_dir / "original" / "consolidated.00.pth"
47+
tokenizer_pth = model_dir / "original" / "tokenizer.model"
48+
if consolidated_pth.is_file() and tokenizer_pth.is_file():
49+
# Confirm we can load it
50+
loaded_result = torch.load(
51+
str(consolidated_pth), map_location="cpu", mmap=True, weights_only=True
52+
)
53+
del loaded_result # No longer needed
54+
print(f"Moving checkpoint to {model_dir / 'model.pth'}.")
55+
os.rename(consolidated_pth, model_dir / "model.pth")
56+
os.rename(tokenizer_pth, model_dir / "tokenizer.model")
57+
print("Done.")
58+
return
59+
else:
60+
raise RuntimeError(f"Could not find {model_map_json} or {consolidated_pth} plus {tokenizer_pth}")
4261

4362
with open(model_map_json) as json_map:
4463
bin_index = json.load(json_map)
@@ -111,7 +130,7 @@ def permute(w, n_heads):
111130
if __name__ == "__main__":
112131
import argparse
113132

114-
parser = argparse.ArgumentParser(description="Convert HuggingFace checkpoint.")
133+
parser = argparse.ArgumentParser(description="Convert Hugging Face checkpoint.")
115134
parser.add_argument(
116135
"--checkpoint-dir",
117136
type=Path,

build/model.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class ModelArgs:
4242
multiple_of: int = 256
4343
ffn_dim_multiplier: Optional[int] = None
4444
use_tiktoken: Optional[bool] = None
45-
45+
4646
def __post_init__(self):
4747
if self.n_local_heads == -1:
4848
self.n_local_heads = self.n_heads
@@ -60,7 +60,7 @@ def __post_init__(self):
6060
if isinstance(self.use_tiktoken, str):
6161
self.use_tiktoken = (self.use_tiktoken == "True")
6262

63-
63+
6464
@classmethod
6565
def from_params(cls, params_path):
6666
replace = [("rope_theta", "rope_base"), ("n_kv_heads", "n_local_heads")]
@@ -85,19 +85,19 @@ def from_table(cls, name: str):
8585

8686
@classmethod
8787
def from_name(cls, name: str):
88-
print(f"name {name}")
88+
print(f"Name {name}")
8989
json_path=f"{config_dir}/{name}.json"
9090
if Path(json_path).is_file():
9191
return ModelArgs.from_params(json_path)
9292

9393
known_model_params = [config.replace(".json", "") for config in os.listdir(config_dir)]
9494

95-
print(f"known configs: {known_model_params}")
96-
# fuzzy search
95+
# Fuzzy search by name (e.g. "7B" and "Mistral-7B")
96+
print(f"Known configs: {known_model_params}")
9797
config = [
9898
config
9999
for config in known_model_params
100-
if config.replace in str(name).upper() or config in str(name)
100+
if config in str(name).upper() or config in str(name)
101101
]
102102

103103
# We may have two or more configs matched (e.g. "7B" and "Mistral-7B"). Find the best config match,

config/data/models.json

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
11
{
2+
"meta-llama/Meta-Llama-3-8B-Instruct": {
3+
"aliases": ["llama3", "llama3-8b"],
4+
"distribution_channel": "HuggingFaceSnapshot",
5+
"distribution_path": "meta-llama/Meta-Llama-3-8B-Instruct"
6+
},
27
"meta-llama/Llama-2-7b-chat-hf": {
38
"aliases": ["llama2", "llama2-7b"],
49
"distribution_channel": "HuggingFaceSnapshot",
510
"distribution_path": "meta-llama/Llama-2-7b-chat-hf"
611
},
712
"mistralai/Mistral-7B-Instruct-v0.2": {
8-
"aliases": ["mistral-7b-instruct"],
13+
"aliases": ["mistral-7b", "mistral-7b-instruct"],
914
"distribution_channel": "HuggingFaceSnapshot",
1015
"distribution_path": "mistralai/Mistral-7B-Instruct-v0.2"
1116
},

download.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def _download_and_convert_hf_snapshot(
2626
from huggingface_hub import snapshot_download
2727

2828
# Download and store the HF model artifacts.
29-
print(f"Downloading {model} from HuggingFace...")
29+
print(f"Downloading {model} from Hugging Face...")
3030
try:
3131
snapshot_download(
3232
model,

0 commit comments

Comments (0)