29 changes: 8 additions & 21 deletions keras_nlp/models/albert/albert_classifier_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.albert.albert_backbone import AlbertBackbone
 from keras_nlp.models.albert.albert_classifier import AlbertClassifier
@@ -27,26 +26,14 @@
 class AlbertClassifierTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = AlbertPreprocessor(
-            AlbertTokenizer(proto=bytes_io.getvalue()),
-            sequence_length=5,
+            AlbertTokenizer(
+                # Generated using create_albert_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "albert_test_vocab.spm"
+                ),
+                sequence_length=5,
+            )
         )
         self.backbone = AlbertBackbone(
             vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
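The "# Generated using create_albert_test_proto.py" comments above reference a generator script that is not included in this diff. The following is a minimal sketch of what that script presumably contains, reconstructed from the training call this change deletes; the script name comes from the comment, while the output path and write logic are assumptions:

# Hypothetical sketch of create_albert_test_proto.py. The trainer arguments
# are copied verbatim from the deleted setUp() code; the output location is
# an assumption and must match what self.get_test_data_dir() resolves to.
import io
import pathlib

import sentencepiece


def main():
    bytes_io = io.BytesIO()
    sentencepiece.SentencePieceTrainer.train(
        sentence_iterator=iter(["the quick brown fox", "the earth is round"]),
        model_writer=bytes_io,
        vocab_size=12,
        model_type="WORD",
        pad_id=0,
        unk_id=1,
        bos_id=2,
        eos_id=3,
        pad_piece="<pad>",
        unk_piece="<unk>",
        bos_piece="[CLS]",
        eos_piece="[SEP]",
        user_defined_symbols="[MASK]",
    )
    pathlib.Path("albert_test_vocab.spm").write_bytes(bytes_io.getvalue())


if __name__ == "__main__":
    main()

Committing the trained .spm file once and loading it from test data keeps the test vocabulary stable across runs and removes the sentencepiece training step from every setUp().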
25 changes: 6 additions & 19 deletions keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.albert.albert_masked_lm_preprocessor import (
     AlbertMaskedLMPreprocessor,
@@ -26,24 +25,12 @@
 
 class AlbertMaskedLMPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = AlbertTokenizer(
+            # Generated using create_albert_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
         )
-        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             # Simplify our testing by masking every available token.
28 changes: 8 additions & 20 deletions keras_nlp/models/albert/albert_masked_lm_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.albert.albert_backbone import AlbertBackbone
 from keras_nlp.models.albert.albert_masked_lm import AlbertMaskedLM
@@ -29,25 +28,14 @@
 class AlbertMaskedLMTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = AlbertMaskedLMPreprocessor(
-            AlbertTokenizer(proto=bytes_io.getvalue()),
+            AlbertTokenizer(
+                # Generated using create_albert_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "albert_test_vocab.spm"
+                ),
+                sequence_length=5,
+            ),
             # Simplify our testing by masking every available token.
             mask_selection_rate=1.0,
             mask_token_rate=1.0,
25 changes: 6 additions & 19 deletions keras_nlp/models/albert/albert_preprocessor_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor
 from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
@@ -24,24 +23,12 @@
 
 class AlbertPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = AlbertTokenizer(
+            # Generated using create_albert_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
         )
-        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             "sequence_length": 8,
43 changes: 13 additions & 30 deletions keras_nlp/models/albert/albert_tokenizer_test.py
@@ -12,35 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
 from keras_nlp.tests.test_case import TestCase
 
 
 class AlbertTokenizerTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
-        self.init_kwargs = {"proto": bytes_io.getvalue()}
+        self.init_kwargs = {
+            # Generated using create_albert_test_proto.py
+            "proto": os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
+        }
         self.input_data = ["the quick brown fox.", "the earth is round."]
 
     def test_tokenizer_basics(self):
@@ -52,17 +39,13 @@ def test_tokenizer_basics(self):
         )
 
     def test_errors_missing_special_tokens(self):
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(["abc"]),
-            model_writer=bytes_io,
-            vocab_size=5,
-            pad_id=-1,
-            eos_id=-1,
-            bos_id=-1,
-        )
         with self.assertRaises(ValueError):
-            AlbertTokenizer(proto=bytes_io.getvalue())
+            AlbertTokenizer(
+                # Generated using create_no_special_token_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "no_special_token_vocab.spm"
+                )
+            )
 
     @pytest.mark.large
     def test_smallest_preset(self):
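The missing-special-tokens test now loads "no_special_token_vocab.spm" instead of training a throwaway model inline. A hedged sketch of the referenced create_no_special_token_proto.py under the same assumptions, with the trainer arguments copied from the deleted code:

# Hypothetical sketch of create_no_special_token_proto.py; the output
# location is an assumption.
import io
import pathlib

import sentencepiece


def main():
    bytes_io = io.BytesIO()
    sentencepiece.SentencePieceTrainer.train(
        sentence_iterator=iter(["abc"]),
        model_writer=bytes_io,
        vocab_size=5,
        # Disable pad/bos/eos so AlbertTokenizer raises ValueError on load.
        pad_id=-1,
        eos_id=-1,
        bos_id=-1,
    )
    pathlib.Path("no_special_token_vocab.spm").write_bytes(bytes_io.getvalue())


if __name__ == "__main__":
    main()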
27 changes: 7 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
 from keras_nlp.models.deberta_v3.deberta_v3_classifier import (
@@ -31,25 +30,13 @@
 class DebertaV3ClassifierTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = DebertaV3Preprocessor(
-            DebertaV3Tokenizer(proto=bytes_io.getvalue()),
+            DebertaV3Tokenizer(
+                # Generated using create_deberta_v3_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+                )
+            ),
             sequence_length=5,
         )
         self.backbone = DebertaV3Backbone(
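The DeBERTa tests reference create_deberta_v3_test_proto.py, also not part of this diff. A sketch under the same assumptions; note that the deleted code assigns DeBERTa's special-token ids and pieces differently from ALBERT (pad/bos/eos/unk as [PAD]/[CLS]/[SEP]/[UNK]):

# Hypothetical sketch of create_deberta_v3_test_proto.py. Trainer arguments
# are copied from the deleted setUp() code; the output location is assumed.
import io
import pathlib

import sentencepiece


def main():
    bytes_io = io.BytesIO()
    sentencepiece.SentencePieceTrainer.train(
        sentence_iterator=iter(["the quick brown fox", "the earth is round"]),
        model_writer=bytes_io,
        vocab_size=12,
        model_type="WORD",
        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3,
        pad_piece="[PAD]",
        bos_piece="[CLS]",
        eos_piece="[SEP]",
        unk_piece="[UNK]",
        user_defined_symbols="[MASK]",
    )
    pathlib.Path("deberta_v3_test_vocab.spm").write_bytes(bytes_io.getvalue())


if __name__ == "__main__":
    main()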
keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.deberta_v3.deberta_v3_masked_lm_preprocessor import (
     DebertaV3MaskedLMPreprocessor,
@@ -26,24 +25,12 @@
 
 class DebertaV3MaskedLMPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = DebertaV3Tokenizer(
+            # Generated using create_deberta_v3_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+            )
         )
-        self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             # Simplify our testing by masking every available token.
27 changes: 7 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
 from keras_nlp.models.deberta_v3.deberta_v3_masked_lm import DebertaV3MaskedLM
@@ -29,25 +28,13 @@
 class DebertaV3MaskedLMTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = DebertaV3MaskedLMPreprocessor(
-            DebertaV3Tokenizer(proto=bytes_io.getvalue()),
+            DebertaV3Tokenizer(
+                # Generated using create_deberta_v3_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+                )
+            ),
             # Simplify our testing by masking every available token.
             mask_selection_rate=1.0,
             mask_token_rate=1.0,