Skip to content

Commit cbc7c25

Browse files
authored
Merge pull request #645 from allenai/shanea/tokenizer-package-data
Move tokenizers to new `olmo_data` package.
2 parents 1b2658b + 8ddfe79 commit cbc7c25

8 files changed

+45
-4
lines changed

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1717
- Added FLOPs logging
1818
- Added configs for OLMo tiny set of models
1919
- Added configuration field `optimizer.record_update_metrics`, which defaults to `False`, but when set to `True` will trigger AdamW to collect the step size norm and absolute max for each parameter.
20+
- Added `olmo_data`, a package holding data files like tokenizers.
21+
- Added ability to load tokenizers from `olmo_data` package data.
2022

2123
### Changed
2224

olmo/tokenizer.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
from tokenizers import Tokenizer as BaseTokenizer
88

9+
from olmo_data import get_data_path, is_data_file
10+
911
from .aliases import PathOrStr
1012
from .config import ModelConfig, TokenizerConfig, TrainConfig, TruncationDirection
1113
from .exceptions import OLMoConfigurationError
@@ -94,7 +96,7 @@ def from_file(cls, filename: PathOrStr, **kwargs) -> Tokenizer:
9496
:param filename: The name of a file containing a tokenizer specification.
9597
:param kwargs: Other key word arguments passed to :class:`Tokenizer`.
9698
"""
97-
base_tokenizer = BaseTokenizer.from_file(filename)
99+
base_tokenizer = BaseTokenizer.from_file(str(filename))
98100
eos_token_id = kwargs.pop("eos_token_id", base_tokenizer.get_vocab_size() - 1)
99101
return cls(base_tokenizer, eos_token_id, **kwargs)
100102

@@ -117,6 +119,14 @@ def from_checkpoint(cls, checkpoint_dir: PathOrStr) -> Tokenizer:
117119
eos_token_id=model_config.eos_token_id,
118120
pad_token_id=model_config.pad_token_id,
119121
)
122+
# Try interpreting the tokenizer identifier as a file within the package
123+
elif is_data_file(tokenizer_config.identifier):
124+
with get_data_path(tokenizer_config.identifier) as tokenizer_path:
125+
tokenizer = cls.from_file(
126+
tokenizer_path,
127+
eos_token_id=model_config.eos_token_id,
128+
pad_token_id=model_config.pad_token_id,
129+
)
120130
else:
121131
tokenizer = cls.from_pretrained(
122132
tokenizer_config.identifier,

olmo_data/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .data import *

olmo_data/data.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from contextlib import contextmanager
2+
from pathlib import Path
3+
from typing import Generator
4+
5+
import importlib_resources
6+
from importlib_resources.abc import Traversable
7+
8+
9+
def _get_data_traversable(data_rel_path: str) -> Traversable:
    """Resolve a path relative to the ``olmo_data`` package root to a ``Traversable``."""
    package_root = importlib_resources.files("olmo_data")
    # Traversable supports ``/`` as an alias for ``joinpath``.
    return package_root / data_rel_path
11+
12+
13+
def is_data_dir(data_rel_path: str) -> bool:
    """Return ``True`` if ``data_rel_path`` names a directory within the ``olmo_data`` package."""
    traversable = _get_data_traversable(data_rel_path)
    return traversable.is_dir()
15+
16+
17+
def is_data_file(data_rel_path: str) -> bool:
    """Return ``True`` if ``data_rel_path`` names a regular file within the ``olmo_data`` package."""
    traversable = _get_data_traversable(data_rel_path)
    return traversable.is_file()
19+
20+
21+
@contextmanager
def get_data_path(data_rel_path: str) -> Generator[Path, None, None]:
    """Yield a concrete filesystem path for a data file within the ``olmo_data`` package.

    :param data_rel_path: Path of the data file or directory relative to the
        ``olmo_data`` package root.

    ``as_file`` may materialize the resource at a temporary location (e.g. when the
    package is installed as a zip), so the yielded path is only guaranteed to exist
    inside the ``with`` block.
    """
    # NOTE: the original wrapped this in ``try: ... finally: pass``, which is a
    # no-op — ``as_file`` already performs its own cleanup and exceptions raised
    # in the caller's ``with`` body propagate identically without the wrapper.
    with importlib_resources.as_file(_get_data_traversable(data_rel_path)) as path:
        yield path

pyproject.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ dependencies = [
2424
"packaging",
2525
"cached_path>=1.6.2",
2626
"transformers",
27+
"importlib_resources",
2728
]
2829

2930
[project.optional-dependencies]
@@ -63,12 +64,13 @@ include-package-data = true
6364

6465
[tool.setuptools.package-data]
6566
olmo = ["py.typed"]
67+
olmo_data = ["**"]
6668

6769
[tool.setuptools.dynamic]
6870
version = { attr = "olmo.version.VERSION" }
6971

7072
[tool.setuptools.packages.find]
71-
include = ["olmo*", "hf_olmo*"]
73+
include = ["olmo*", "hf_olmo*", "olmo_data*"]
7274
exclude = [
7375
"*.tests",
7476
"*.tests.*",

scripts/convert_olmo_to_hf_new.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,10 @@
2121

2222
import torch
2323
import yaml
24+
from tokenizers import Tokenizer
2425
from transformers import OlmoConfig, OlmoForCausalLM
2526
from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
2627

27-
from tokenizers import Tokenizer
28-
2928
"""
3029
Sample usage:
3130
```

0 commit comments

Comments
 (0)