Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions keras_hub/src/models/backbone.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,3 +277,19 @@ def load_lora_weights(self, filepath):
layer.lora_kernel_a.assign(lora_kernel_a)
layer.lora_kernel_b.assign(lora_kernel_b)
store.close()

def export_to_transformers(self, path):
"""Export the backbone model to HuggingFace Transformers format.

This saves the backbone's configuration and weights in a format
compatible with HuggingFace Transformers. For unsupported model
architectures, a ValueError is raised.

Args:
path: str. Path to save the exported model.
"""
from keras_hub.src.utils.transformers.export.hf_exporter import (
export_backbone,
)

export_backbone(self, path)
31 changes: 31 additions & 0 deletions keras_hub/src/models/backbone_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from keras_hub.src.models.backbone import Backbone
from keras_hub.src.models.bert.bert_backbone import BertBackbone
from keras_hub.src.models.gemma.gemma_backbone import GemmaBackbone
from keras_hub.src.models.gpt2.gpt2_backbone import GPT2Backbone
from keras_hub.src.tests.test_case import TestCase
from keras_hub.src.utils.preset_utils import CONFIG_FILE
Expand All @@ -15,6 +16,18 @@


class TestBackbone(TestCase):
def setUp(self):
# Common config for backbone instantiation in export tests
self.backbone_config = {
"vocabulary_size": 1000,
"num_layers": 2,
"num_query_heads": 4,
"num_key_value_heads": 1,
"hidden_dim": 512,
"intermediate_dim": 1024,
"head_dim": 128,
}

def test_preset_accessors(self):
bert_presets = set(BertBackbone.presets.keys())
gpt2_presets = set(GPT2Backbone.presets.keys())
Expand Down Expand Up @@ -105,3 +118,21 @@ def test_save_to_preset(self):
ref_out = backbone(data)
new_out = restored_backbone(data)
self.assertAllClose(ref_out, new_out)

def test_export_supported_model(self):
backbone = GemmaBackbone(**self.backbone_config)
export_path = os.path.join(self.get_temp_dir(), "export_backbone")
backbone.export_to_transformers(export_path)
# Basic check: config file exists
self.assertTrue(
os.path.exists(os.path.join(export_path, "config.json"))
)

def test_export_unsupported_model(self):
class UnsupportedBackbone(GemmaBackbone):
pass

backbone = UnsupportedBackbone(**self.backbone_config)
export_path = os.path.join(self.get_temp_dir(), "unsupported")
with self.assertRaises(ValueError):
backbone.export_to_transformers(export_path)
21 changes: 21 additions & 0 deletions keras_hub/src/models/causal_lm.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,3 +392,24 @@ def postprocess(x):
outputs = [postprocess(x) for x in outputs]

return self._normalize_generate_outputs(outputs, input_is_scalar)

    def export_to_transformers(self, path):
        """Export the full CausalLM model to HuggingFace Transformers format.

        This exports the backbone, tokenizer, and configurations in a format
        compatible with HuggingFace Transformers. For unsupported model
        architectures, a ValueError is raised.

        If the preprocessor is attached (default), both the backbone and
        tokenizer are exported. To export only the backbone, set
        `self.preprocessor = None` before calling this method, then export
        the preprocessor separately via
        `preprocessor.export_to_transformers(path)`.

        Args:
            path: str. Path to save the exported model.
        """
        # Imported lazily to avoid pulling in export machinery at load time.
        from keras_hub.src.utils.transformers.export.hf_exporter import (
            export_to_safetensors,
        )

        export_to_safetensors(self, path)
14 changes: 14 additions & 0 deletions keras_hub/src/models/causal_lm_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,17 @@ def sequence_length(self, value):
self._sequence_length = value
if self.packer is not None:
self.packer.sequence_length = value

def export_to_transformers(self, path):
"""Export the preprocessor (tokenizer) to HuggingFace format.

Args:
path: str. Path to save the exported preprocessor/tokenizer.
"""
if self.tokenizer is None:
raise ValueError("Preprocessor must have a tokenizer for export.")
from keras_hub.src.utils.transformers.export.hf_exporter import (
export_tokenizer,
)

export_tokenizer(self.tokenizer, path)
51 changes: 51 additions & 0 deletions keras_hub/src/models/causal_lm_preprocessor_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import os

import pytest
from sentencepiece import SentencePieceTrainer

from keras_hub.src.models.bert.bert_tokenizer import BertTokenizer
from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor
from keras_hub.src.models.gemma.gemma_causal_lm_preprocessor import (
GemmaCausalLMPreprocessor,
)
from keras_hub.src.models.gemma.gemma_tokenizer import GemmaTokenizer
from keras_hub.src.models.gpt2.gpt2_causal_lm_preprocessor import (
GPT2CausalLMPreprocessor,
)
Expand All @@ -10,6 +17,32 @@


class TestCausalLMPreprocessor(TestCase):
def setUp(self):
# Common setup for export tests
train_sentences = [
"The quick brown fox jumped.",
"I like pizza.",
"This is a test.",
]
self.proto_prefix = os.path.join(self.get_temp_dir(), "dummy_vocab")
SentencePieceTrainer.train(
sentence_iterator=iter(train_sentences),
model_prefix=self.proto_prefix,
vocab_size=290,
model_type="unigram",
pad_id=0,
bos_id=2,
eos_id=1,
unk_id=3,
byte_fallback=True,
pad_piece="<pad>",
bos_piece="<bos>",
eos_piece="<eos>",
unk_piece="<unk>",
user_defined_symbols=["<start_of_turn>", "<end_of_turn>"],
add_dummy_prefix=False,
)

def test_preset_accessors(self):
bert_presets = set(BertTokenizer.presets.keys())
gpt2_presets = set(GPT2Preprocessor.presets.keys())
Expand Down Expand Up @@ -43,3 +76,21 @@ def test_from_preset_errors(self):
with self.assertRaises(ValueError):
# No loading on a non-keras model.
GPT2CausalLMPreprocessor.from_preset("hf://spacy/en_core_web_sm")

def test_export_supported_preprocessor(self):
tokenizer = GemmaTokenizer(proto=f"{self.proto_prefix}.model")
preprocessor = GemmaCausalLMPreprocessor(tokenizer=tokenizer)
export_path = os.path.join(self.get_temp_dir(), "export_preprocessor")
preprocessor.export_to_transformers(export_path)
# Basic check: tokenizer config exists
self.assertTrue(
os.path.exists(os.path.join(export_path, "tokenizer_config.json"))
)

def test_export_missing_tokenizer(self):
preprocessor = GemmaCausalLMPreprocessor(tokenizer=None)
export_path = os.path.join(
self.get_temp_dir(), "export_missing_tokenizer"
)
with self.assertRaises(ValueError):
preprocessor.export_to_transformers(export_path)
109 changes: 109 additions & 0 deletions keras_hub/src/models/task_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,16 @@
import keras
import numpy as np
import pytest
from sentencepiece import SentencePieceTrainer

from keras_hub.src.models.bert.bert_text_classifier import BertTextClassifier
from keras_hub.src.models.causal_lm import CausalLM
from keras_hub.src.models.gemma.gemma_backbone import GemmaBackbone
from keras_hub.src.models.gemma.gemma_causal_lm import GemmaCausalLM
from keras_hub.src.models.gemma.gemma_causal_lm_preprocessor import (
GemmaCausalLMPreprocessor,
)
from keras_hub.src.models.gemma.gemma_tokenizer import GemmaTokenizer
from keras_hub.src.models.gpt2.gpt2_causal_lm import GPT2CausalLM
from keras_hub.src.models.image_classifier import ImageClassifier
from keras_hub.src.models.preprocessor import Preprocessor
Expand Down Expand Up @@ -44,6 +51,46 @@ def __init__(self, preprocessor=None, activation=None, **kwargs):


class TestTask(TestCase):
def setUp(self):
# Common setup for export tests
train_sentences = [
"The quick brown fox jumped.",
"I like pizza.",
"This is a test.",
]
self.proto_prefix = os.path.join(self.get_temp_dir(), "dummy_vocab")
SentencePieceTrainer.train(
sentence_iterator=iter(train_sentences),
model_prefix=self.proto_prefix,
vocab_size=290,
model_type="unigram",
pad_id=0,
bos_id=2,
eos_id=1,
unk_id=3,
byte_fallback=True,
pad_piece="<pad>",
bos_piece="<bos>",
eos_piece="<eos>",
unk_piece="<unk>",
user_defined_symbols=["<start_of_turn>", "<end_of_turn>"],
add_dummy_prefix=False,
)
self.tokenizer = GemmaTokenizer(proto=f"{self.proto_prefix}.model")
self.backbone = GemmaBackbone(
vocabulary_size=self.tokenizer.vocabulary_size(),
num_layers=2,
num_query_heads=4,
num_key_value_heads=1,
hidden_dim=512,
intermediate_dim=1024,
head_dim=128,
)
self.preprocessor = GemmaCausalLMPreprocessor(tokenizer=self.tokenizer)
self.causal_lm = GemmaCausalLM(
backbone=self.backbone, preprocessor=self.preprocessor
)

def test_preset_accessors(self):
bert_presets = set(BertTextClassifier.presets.keys())
gpt2_presets = set(GPT2CausalLM.presets.keys())
Expand Down Expand Up @@ -171,3 +218,65 @@ def test_save_to_preset_custom_backbone_and_preprocessor(self):
restored_task = ImageClassifier.from_preset(save_dir)
actual = restored_task.predict(batch)
self.assertAllClose(expected, actual)

def test_export_attached(self):
export_path = os.path.join(self.get_temp_dir(), "export_attached")
self.causal_lm.export_to_transformers(export_path)
# Basic check: config and tokenizer files exist
self.assertTrue(
os.path.exists(os.path.join(export_path, "config.json"))
)
self.assertTrue(
os.path.exists(os.path.join(export_path, "tokenizer_config.json"))
)

def test_export_attached_with_lm_head(self):
# Since attached export always includes lm_head=True, this test verifies
# the same but explicitly notes it for coverage.
export_path = os.path.join(
self.get_temp_dir(), "export_attached_lm_head"
)
self.causal_lm.export_to_transformers(export_path)
# Basic check: config and tokenizer files exist
self.assertTrue(
os.path.exists(os.path.join(export_path, "config.json"))
)
self.assertTrue(
os.path.exists(os.path.join(export_path, "tokenizer_config.json"))
)

def test_export_detached(self):
export_path_backbone = os.path.join(
self.get_temp_dir(), "export_detached_backbone"
)
export_path_preprocessor = os.path.join(
self.get_temp_dir(), "export_detached_preprocessor"
)
original_preprocessor = self.causal_lm.preprocessor
self.causal_lm.preprocessor = None
self.causal_lm.export_to_transformers(export_path_backbone)
self.causal_lm.preprocessor = original_preprocessor
self.preprocessor.export_to_transformers(export_path_preprocessor)
# Basic check: backbone has config, no tokenizer; preprocessor has
# tokenizer config
self.assertTrue(
os.path.exists(os.path.join(export_path_backbone, "config.json"))
)
self.assertFalse(
os.path.exists(
os.path.join(export_path_backbone, "tokenizer_config.json")
)
)
self.assertTrue(
os.path.exists(
os.path.join(export_path_preprocessor, "tokenizer_config.json")
)
)

def test_export_missing_tokenizer(self):
self.preprocessor.tokenizer = None
export_path = os.path.join(
self.get_temp_dir(), "export_missing_tokenizer"
)
with self.assertRaises(ValueError):
self.causal_lm.export_to_transformers(export_path)
15 changes: 15 additions & 0 deletions keras_hub/src/tokenizers/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,3 +261,18 @@ class like `keras_hub.models.Tokenizer.from_preset()`, or from
if cls.backbone_cls != backbone_cls:
cls = find_subclass(preset, cls, backbone_cls)
return loader.load_tokenizer(cls, config_file, **kwargs)

def export_to_transformers(self, path):
"""Export the tokenizer to HuggingFace Transformers format.

This saves tokenizer assets in a format compatible with HuggingFace
Transformers.

Args:
path: str. Path to save the exported tokenizer.
"""
from keras_hub.src.utils.transformers.export.hf_exporter import (
export_tokenizer,
)

export_tokenizer(self, path)
46 changes: 46 additions & 0 deletions keras_hub/src/tokenizers/tokenizer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import pytest
import tensorflow as tf
from absl.testing import parameterized
from sentencepiece import SentencePieceTrainer

from keras_hub.src.models.albert.albert_tokenizer import AlbertTokenizer
from keras_hub.src.models.bert.bert_tokenizer import BertTokenizer
from keras_hub.src.models.gemma.gemma_tokenizer import GemmaTokenizer
from keras_hub.src.models.gpt2.gpt2_tokenizer import GPT2Tokenizer
from keras_hub.src.models.roberta.roberta_tokenizer import RobertaTokenizer
from keras_hub.src.tests.test_case import TestCase
Expand All @@ -27,6 +29,32 @@ def detokenize(self, inputs):


class TokenizerTest(TestCase):
def setUp(self):
# Common setup for export tests
train_sentences = [
"The quick brown fox jumped.",
"I like pizza.",
"This is a test.",
]
self.proto_prefix = os.path.join(self.get_temp_dir(), "dummy_vocab")
SentencePieceTrainer.train(
sentence_iterator=iter(train_sentences),
model_prefix=self.proto_prefix,
vocab_size=290,
model_type="unigram",
pad_id=0,
bos_id=2,
eos_id=1,
unk_id=3,
byte_fallback=True,
pad_piece="<pad>",
bos_piece="<bos>",
eos_piece="<eos>",
unk_piece="<unk>",
user_defined_symbols=["<start_of_turn>", "<end_of_turn>"],
add_dummy_prefix=False,
)

def test_preset_accessors(self):
bert_presets = set(BertTokenizer.presets.keys())
gpt2_presets = set(GPT2Tokenizer.presets.keys())
Expand Down Expand Up @@ -113,3 +141,21 @@ def test_save_to_preset(self, cls, preset_name, tokenizer_type):
# Check config class.
tokenizer_config = load_json(save_dir, TOKENIZER_CONFIG_FILE)
self.assertEqual(cls, check_config_class(tokenizer_config))

def test_export_supported_tokenizer(self):
tokenizer = GemmaTokenizer(proto=f"{self.proto_prefix}.model")
export_path = os.path.join(self.get_temp_dir(), "export_tokenizer")
tokenizer.export_to_transformers(export_path)
# Basic check: tokenizer config exists
self.assertTrue(
os.path.exists(os.path.join(export_path, "tokenizer_config.json"))
)

def test_export_unsupported_tokenizer(self):
class UnsupportedTokenizer(GemmaTokenizer):
pass

tokenizer = UnsupportedTokenizer(proto=f"{self.proto_prefix}.model")
export_path = os.path.join(self.get_temp_dir(), "unsupported_tokenizer")
with self.assertRaises(ValueError):
tokenizer.export_to_transformers(export_path)
Loading
Loading