Merge pull request #611 from allenai/shanea/hf-get-tokenizer-from-config-2

Read and use tokenizer identifier from config
allenai · Jun 10, 2024 · 578234d · 578234d
2 parents de43ee8 + c02f1ca
commit 578234d
Showing 1 changed file with 8 additions and 1 deletion.
diff --git a/scripts/convert_olmo_to_hf_new.py b/scripts/convert_olmo_to_hf_new.py
@@ -203,7 +203,14 @@ def _write_tokenizer(
     if input_tokenizer_path is not None:
         base_tokenizer = Tokenizer.from_file(str(input_tokenizer_path))
     else:
-        base_tokenizer = Tokenizer.from_pretrained(checkpoint_dir)
+        config_path = Path(checkpoint_dir) / "config.yaml"
+        tokenizer_config = yaml.safe_load(config_path.read_text())["tokenizer"]
+
+        # Initialize tokenizer and validate vocab size.
+        if Path(tokenizer_config["identifier"]).is_file():
+            base_tokenizer = Tokenizer.from_file(tokenizer_config["identifier"])
+        else:
+            base_tokenizer = Tokenizer.from_pretrained(tokenizer_config["identifier"])
 
     eos_token_id = config.eos_token_id if config.eos_token_id is not None else base_tokenizer.get_vocab_size() - 1
     pad_token_id = config.pad_token_id if config.pad_token_id is not None else eos_token_id