diff --git a/CHANGELOG.md b/CHANGELOG.md index f5fcaa11b..b013e863c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Don't log garbage on nodes that aren't rank 0 +- Don't crash in the HF code when we are referring to a tokenizer in a local file ## [v0.2.5](https://github.com/allenai/OLMo/releases/tag/v0.2.5) - 2024-03-06 diff --git a/olmo/tokenizer.py b/olmo/tokenizer.py index 3ed064097..625ce45e9 100644 --- a/olmo/tokenizer.py +++ b/olmo/tokenizer.py @@ -111,11 +111,18 @@ def from_checkpoint(cls, checkpoint_dir: PathOrStr) -> Tokenizer: model_config = ModelConfig.load(config_path, key="model") # Initialize tokenizer and validate vocab size. - tokenizer = cls.from_pretrained( - tokenizer_config.identifier, - eos_token_id=model_config.eos_token_id, - pad_token_id=model_config.pad_token_id, - ) + if Path(tokenizer_config.identifier).is_file(): + tokenizer = cls.from_file( + tokenizer_config.identifier, + eos_token_id=model_config.eos_token_id, + pad_token_id=model_config.pad_token_id, + ) + else: + tokenizer = cls.from_pretrained( + tokenizer_config.identifier, + eos_token_id=model_config.eos_token_id, + pad_token_id=model_config.pad_token_id, + ) if model_config.vocab_size != tokenizer.vocab_size: raise OLMoConfigurationError("vocab size mismatch between config and tokenizer") return tokenizer