Skip to content

Commit

Permalink
Merge pull request #611 from allenai/shanea/hf-get-tokenizer-from-config-2
Browse files Browse the repository at this point in the history

Read and use tokenizer identifier from config
  • Loading branch information
2015aroras authored Jun 10, 2024
2 parents de43ee8 + c02f1ca commit 578234d
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion scripts/convert_olmo_to_hf_new.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -203,7 +203,14 @@ def _write_tokenizer(
if input_tokenizer_path is not None:
base_tokenizer = Tokenizer.from_file(str(input_tokenizer_path))
else:
base_tokenizer = Tokenizer.from_pretrained(checkpoint_dir)
config_path = Path(checkpoint_dir) / "config.yaml"
tokenizer_config = yaml.safe_load(config_path.read_text())["tokenizer"]

# Initialize tokenizer and validate vocab size.
if Path(tokenizer_config["identifier"]).is_file():
base_tokenizer = Tokenizer.from_file(tokenizer_config["identifier"])
else:
base_tokenizer = Tokenizer.from_pretrained(tokenizer_config["identifier"])

eos_token_id = config.eos_token_id if config.eos_token_id is not None else base_tokenizer.get_vocab_size() - 1
pad_token_id = config.pad_token_id if config.pad_token_id is not None else eos_token_id
Expand Down

0 comments on commit 578234d

Please sign in to comment.