Skip to content

Commit cbc7c25

Browse files
authored
Merge pull request #645 from allenai/shanea/tokenizer-package-data
Move tokenizers to new `olmo_data` package.
2 parents 1b2658b + 8ddfe79 commit cbc7c25

8 files changed

+45
-4
lines changed

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1717
- Added FLOPs logging
1818
- Added configs for OLMo tiny set of models
1919
- Added configuration field `optimizer.record_update_metrics`, which defaults to `False`, but when set to `True` will trigger AdamW to collect the step size norm and absolute max for each parameter.
20+
- Added `olmo_data`, a package holding data files like tokenizers.
21+
- Added ability to load tokenizers from `olmo_data` package data.
2022

2123
### Changed
2224

olmo/tokenizer.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
from tokenizers import Tokenizer as BaseTokenizer
88

9+
from olmo_data import get_data_path, is_data_file
10+
911
from .aliases import PathOrStr
1012
from .config import ModelConfig, TokenizerConfig, TrainConfig, TruncationDirection
1113
from .exceptions import OLMoConfigurationError
@@ -94,7 +96,7 @@ def from_file(cls, filename: PathOrStr, **kwargs) -> Tokenizer:
9496
:param filename: The name of a file containing a tokenizer specification.
9597
:param kwargs: Other key word arguments passed to :class:`Tokenizer`.
9698
"""
97-
base_tokenizer = BaseTokenizer.from_file(filename)
99+
base_tokenizer = BaseTokenizer.from_file(str(filename))
98100
eos_token_id = kwargs.pop("eos_token_id", base_tokenizer.get_vocab_size() - 1)
99101
return cls(base_tokenizer, eos_token_id, **kwargs)
100102

@@ -117,6 +119,14 @@ def from_checkpoint(cls, checkpoint_dir: PathOrStr) -> Tokenizer:
117119
eos_token_id=model_config.eos_token_id,
118120
pad_token_id=model_config.pad_token_id,
119121
)
122+
# Try interpreting the tokenizer identifier as a file within the package
123+
elif is_data_file(tokenizer_config.identifier):
124+
with get_data_path(tokenizer_config.identifier) as tokenizer_path:
125+
tokenizer = cls.from_file(
126+
tokenizer_path,
127+
eos_token_id=model_config.eos_token_id,
128+
pad_token_id=model_config.pad_token_id,
129+
)
120130
else:
121131
tokenizer = cls.from_pretrained(
122132
tokenizer_config.identifier,

olmo_data/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .data import *

olmo_data/data.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from contextlib import contextmanager
2+
from pathlib import Path
3+
from typing import Generator
4+
5+
import importlib_resources
6+
from importlib_resources.abc import Traversable
7+
8+
9+
def _get_data_traversable(data_rel_path: str) -> Traversable:
    """Resolve a path relative to the ``olmo_data`` package root to a ``Traversable``."""
    package_root = importlib_resources.files("olmo_data")
    # Traversable supports ``/`` as an alias for ``joinpath``.
    return package_root / data_rel_path
11+
12+
13+
def is_data_dir(data_rel_path: str) -> bool:
    """Return ``True`` if ``data_rel_path`` names a directory within the ``olmo_data`` package."""
    traversable = _get_data_traversable(data_rel_path)
    return traversable.is_dir()
15+
16+
17+
def is_data_file(data_rel_path: str) -> bool:
    """Return ``True`` if ``data_rel_path`` names a regular file within the ``olmo_data`` package."""
    traversable = _get_data_traversable(data_rel_path)
    return traversable.is_file()
19+
20+
21+
@contextmanager
def get_data_path(data_rel_path: str) -> Generator[Path, None, None]:
    """Yield a concrete filesystem path for a data file within the ``olmo_data`` package.

    :param data_rel_path: Path of the data file or directory relative to the
        ``olmo_data`` package root.

    ``as_file`` may materialize the resource at a temporary location (e.g. when the
    package is installed as a zip), so the yielded path is only guaranteed to exist
    inside the ``with`` block.
    """
    # NOTE: the original wrapped this in ``try: ... finally: pass``, which is a
    # no-op — ``as_file`` already performs its own cleanup and exceptions raised
    # in the caller's ``with`` body propagate identically without the wrapper.
    with importlib_resources.as_file(_get_data_traversable(data_rel_path)) as path:
        yield path

pyproject.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ dependencies = [
2424
"packaging",
2525
"cached_path>=1.6.2",
2626
"transformers",
27+
"importlib_resources",
2728
]
2829

2930
[project.optional-dependencies]
@@ -63,12 +64,13 @@ include-package-data = true
6364

6465
[tool.setuptools.package-data]
6566
olmo = ["py.typed"]
67+
olmo_data = ["**"]
6668

6769
[tool.setuptools.dynamic]
6870
version = { attr = "olmo.version.VERSION" }
6971

7072
[tool.setuptools.packages.find]
71-
include = ["olmo*", "hf_olmo*"]
73+
include = ["olmo*", "hf_olmo*", "olmo_data*"]
7274
exclude = [
7375
"*.tests",
7476
"*.tests.*",

scripts/convert_olmo_to_hf_new.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,10 @@
2121

2222
import torch
2323
import yaml
24+
from tokenizers import Tokenizer
2425
from transformers import OlmoConfig, OlmoForCausalLM
2526
from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
2627

27-
from tokenizers import Tokenizer
28-
2928
"""
3029
Sample usage:
3130
```

0 commit comments

Comments
 (0)