Commit cbc7c25
Merge pull request #645 from allenai/shanea/tokenizer-package-data
Move tokenizers to new `olmo_data` package.
2015aroras authored Jul 8, 2024
2 parents 1b2658b + 8ddfe79 commit cbc7c25
Showing 8 changed files with 45 additions and 4 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added FLOPs logging
- Added configs for OLMo tiny set of models
- Added configuration field `optimizer.record_update_metrics`, which defaults to `False`, but when set to True will trigger AdamW to collect the step size norm and absolute max for each parameter.
- Added `olmo_data`, a package holding data files like tokenizers.
- Added ability to load tokenizers from `olmo_data` package data.

### Changed

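The two new entries describe the user-facing change: a tokenizer identifier may now name a file bundled inside the `olmo_data` package. A minimal sketch of that check, assuming `olmo_data` is installed; the file name below is hypothetical, since the bundled tokenizer files are not listed in this diff:

```python
# Minimal sketch: probe whether an identifier resolves to packaged data.
# The path is hypothetical; actual file names are not shown in this diff.
from olmo_data import is_data_file

identifier = "tokenizers/example-tokenizer.json"  # hypothetical
if is_data_file(identifier):
    print(f"{identifier} ships inside the olmo_data package")
```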
12 changes: 11 additions & 1 deletion olmo/tokenizer.py
@@ -6,6 +6,8 @@

from tokenizers import Tokenizer as BaseTokenizer

from olmo_data import get_data_path, is_data_file

from .aliases import PathOrStr
from .config import ModelConfig, TokenizerConfig, TrainConfig, TruncationDirection
from .exceptions import OLMoConfigurationError
@@ -94,7 +96,7 @@ def from_file(cls, filename: PathOrStr, **kwargs) -> Tokenizer:
        :param filename: The name of a file containing a tokenizer specification.
        :param kwargs: Other keyword arguments passed to :class:`Tokenizer`.
        """
-        base_tokenizer = BaseTokenizer.from_file(filename)
+        base_tokenizer = BaseTokenizer.from_file(str(filename))
        eos_token_id = kwargs.pop("eos_token_id", base_tokenizer.get_vocab_size() - 1)
        return cls(base_tokenizer, eos_token_id, **kwargs)
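The one-line change above wraps `filename` in `str()`, presumably because the package-data path added below hands `from_file` a `pathlib.Path` while the underlying `tokenizers` library requires a plain string here. A minimal sketch of the safe calling pattern, using a hypothetical tokenizer JSON:

```python
# Sketch only: the tokenizer file name is hypothetical. Converting the
# Path to str mirrors the change above and works for str and Path callers.
from pathlib import Path

from tokenizers import Tokenizer as BaseTokenizer

tokenizer_path = Path("example-tokenizer.json")  # hypothetical file
base_tokenizer = BaseTokenizer.from_file(str(tokenizer_path))
```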

@@ -117,6 +119,14 @@ def from_checkpoint(cls, checkpoint_dir: PathOrStr) -> Tokenizer:
                eos_token_id=model_config.eos_token_id,
                pad_token_id=model_config.pad_token_id,
            )
        # Try interpreting the tokenizer identifier as a file within the package
        elif is_data_file(tokenizer_config.identifier):
            with get_data_path(tokenizer_config.identifier) as tokenizer_path:
                tokenizer = cls.from_file(
                    tokenizer_path,
                    eos_token_id=model_config.eos_token_id,
                    pad_token_id=model_config.pad_token_id,
                )
        else:
            tokenizer = cls.from_pretrained(
                tokenizer_config.identifier,
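With this hunk, `from_checkpoint` resolves `tokenizer_config.identifier` in three steps: a file on disk (the branch above the hunk), a file bundled in `olmo_data`, and finally a pretrained identifier. A hedged sketch of the equivalent standalone logic; the identifier and token IDs are illustrative, not values from a real OLMo config:

```python
# Sketch of the three-step resolution order; identifier and token IDs
# are illustrative, not taken from a real checkpoint config.
from pathlib import Path

from olmo.tokenizer import Tokenizer
from olmo_data import get_data_path, is_data_file

identifier = "tokenizers/example-tokenizer.json"  # hypothetical
eos_id, pad_id = 0, 1  # illustrative token IDs

if Path(identifier).is_file():  # 1. a real file on disk
    tokenizer = Tokenizer.from_file(identifier, eos_token_id=eos_id, pad_token_id=pad_id)
elif is_data_file(identifier):  # 2. a file shipped inside olmo_data
    with get_data_path(identifier) as tokenizer_path:
        tokenizer = Tokenizer.from_file(tokenizer_path, eos_token_id=eos_id, pad_token_id=pad_id)
else:  # 3. fall back to a pretrained tokenizer name
    tokenizer = Tokenizer.from_pretrained(identifier, eos_token_id=eos_id, pad_token_id=pad_id)
```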
1 change: 1 addition & 0 deletions olmo_data/__init__.py
@@ -0,0 +1 @@
from .data import *
27 changes: 27 additions & 0 deletions olmo_data/data.py
@@ -0,0 +1,27 @@
from contextlib import contextmanager
from pathlib import Path
from typing import Generator

import importlib_resources
from importlib_resources.abc import Traversable


def _get_data_traversable(data_rel_path: str) -> Traversable:
    return importlib_resources.files("olmo_data").joinpath(data_rel_path)


def is_data_dir(data_rel_path: str) -> bool:
    return _get_data_traversable(data_rel_path).is_dir()


def is_data_file(data_rel_path: str) -> bool:
    return _get_data_traversable(data_rel_path).is_file()


@contextmanager
def get_data_path(data_rel_path: str) -> Generator[Path, None, None]:
    try:
        with importlib_resources.as_file(_get_data_traversable(data_rel_path)) as path:
            yield path
    finally:
        pass
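A short usage sketch for the new helpers; both package-relative paths are hypothetical. `get_data_path` is a context manager because `importlib_resources.as_file` may materialize the resource as a temporary file (e.g. when the package is imported from a zip), so the yielded `Path` is only guaranteed valid inside the `with` block:

```python
# Usage sketch; both package-relative paths below are hypothetical.
from olmo_data import get_data_path, is_data_dir, is_data_file

if is_data_dir("tokenizers"):
    print("olmo_data ships a tokenizers/ directory")

rel_path = "tokenizers/example-tokenizer.json"
if is_data_file(rel_path):
    with get_data_path(rel_path) as path:
        print(path.stat().st_size)  # Path is valid only inside the with block
```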
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
"packaging",
"cached_path>=1.6.2",
"transformers",
"importlib_resources",
]

[project.optional-dependencies]
@@ -63,12 +64,13 @@ include-package-data = true

[tool.setuptools.package-data]
olmo = ["py.typed"]
olmo_data = ["**"]

[tool.setuptools.dynamic]
version = { attr = "olmo.version.VERSION" }

[tool.setuptools.packages.find]
-include = ["olmo*", "hf_olmo*"]
+include = ["olmo*", "hf_olmo*", "olmo_data*"]
exclude = [
    "*.tests",
    "*.tests.*",
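Both pyproject changes are needed for the data files to ship: the `package-data` glob `olmo_data = ["**"]` bundles everything under the package, and adding `olmo_data*` to `packages.find` makes setuptools pick the package up at all. A quick post-install sanity check, sketched with the same `importlib_resources` API the package itself uses:

```python
# Sanity-check sketch: list what actually got bundled into olmo_data.
import importlib_resources

root = importlib_resources.files("olmo_data")
for entry in root.iterdir():
    print(entry.name)  # expect data files/dirs alongside the Python modules
```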
3 changes: 1 addition & 2 deletions scripts/convert_olmo_to_hf_new.py
@@ -21,11 +21,10 @@

import torch
import yaml
+from tokenizers import Tokenizer
from transformers import OlmoConfig, OlmoForCausalLM
from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast

-from tokenizers import Tokenizer

"""
Sample usage:
```
