Merged. Changes from 8 commits.
31 changes: 10 additions & 21 deletions keras_nlp/models/albert/albert_classifier_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import pathlib

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_backbone import AlbertBackbone
from keras_nlp.models.albert.albert_classifier import AlbertClassifier
@@ -27,26 +26,16 @@
class AlbertClassifierTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.preprocessor = AlbertPreprocessor(
AlbertTokenizer(proto=bytes_io.getvalue()),
sequence_length=5,
AlbertTokenizer(
proto=str(
Member:
This is a bit of a mouthful. Can we maybe add this to our base class for tests in test_case.py?

proto=os.path.join(self.test_data_dir(), "albert_test_vocab.spm")

Contributor Author:
Done.
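
For context, a sketch of the helper this suggestion implies. The method name test_data_dir comes from the snippet above; its exact shape and placement in keras_nlp/tests/test_case.py are assumptions.

    import pathlib


    class TestCase:
        """Simplified stand-in for the shared base class in
        keras_nlp/tests/test_case.py."""

        def test_data_dir(self):
            # test_case.py sits in keras_nlp/tests/, so the fixture
            # directory tests/test_data/ resolves relative to this file.
            return str(pathlib.Path(__file__).parent / "test_data")

With that helper, each setUp reduces to the one-liner suggested above: proto=os.path.join(self.test_data_dir(), "albert_test_vocab.spm").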

pathlib.Path(__file__).parent.parent.parent
/ "tests"
/ "test_data"
/ "albert_sentencepiece.proto"
Member:
Maybe in keeping with our preset suffixes and name, let's call this albert_test_vocab.spm. Let's also drop a comment right above this line, # Generated with create_albert_test_proto.py, so people know how to update this.

Contributor Author:
Done.
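
The generator script named above is not part of this diff; a plausible sketch of create_albert_test_proto.py, reconstructed from the inline training call these tests previously used (same corpus, vocab size, and special-token pieces). The output path is an assumption based on the review suggestion.

    import io
    import pathlib

    import sentencepiece

    vocab_data = ["the quick brown fox", "the earth is round"]
    bytes_io = io.BytesIO()
    sentencepiece.SentencePieceTrainer.train(
        sentence_iterator=iter(vocab_data),
        model_writer=bytes_io,
        vocab_size=12,
        model_type="WORD",
        pad_id=0,
        unk_id=1,
        bos_id=2,
        eos_id=3,
        pad_piece="<pad>",
        unk_piece="<unk>",
        bos_piece="[CLS]",
        eos_piece="[SEP]",
        user_defined_symbols="[MASK]",
    )
    # Write the serialized model where the tests expect it; the filename
    # follows the review suggestion above (assumed path).
    out = pathlib.Path("keras_nlp/tests/test_data/albert_test_vocab.spm")
    out.write_bytes(bytes_io.getvalue())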

),
sequence_length=5,
)
)
self.backbone = AlbertBackbone(
vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
27 changes: 8 additions & 19 deletions keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import pathlib

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_masked_lm_preprocessor import (
AlbertMaskedLMPreprocessor,
@@ -26,24 +25,14 @@

class AlbertMaskedLMPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = AlbertTokenizer(
proto=str(
pathlib.Path(__file__).parent.parent.parent
/ "tests"
/ "test_data"
/ "albert_sentencepiece.proto"
)
)
self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
# Simplify our testing by masking every available token.
30 changes: 10 additions & 20 deletions keras_nlp/models/albert/albert_masked_lm_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import pathlib

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_backbone import AlbertBackbone
from keras_nlp.models.albert.albert_masked_lm import AlbertMaskedLM
@@ -29,25 +28,16 @@
class AlbertMaskedLMTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.preprocessor = AlbertMaskedLMPreprocessor(
AlbertTokenizer(proto=bytes_io.getvalue()),
AlbertTokenizer(
proto=str(
pathlib.Path(__file__).parent.parent.parent
/ "tests"
/ "test_data"
/ "albert_sentencepiece.proto"
),
sequence_length=5,
),
# Simplify our testing by masking every available token.
mask_selection_rate=1.0,
mask_token_rate=1.0,
27 changes: 8 additions & 19 deletions keras_nlp/models/albert/albert_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import pathlib

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor
from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
@@ -24,24 +23,14 @@

class AlbertPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
self.tokenizer = AlbertTokenizer(
proto=str(
pathlib.Path(__file__).parent.parent.parent
/ "tests"
/ "test_data"
/ "albert_sentencepiece.proto"
)
)
self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
"sequence_length": 8,
47 changes: 17 additions & 30 deletions keras_nlp/models/albert/albert_tokenizer_test.py
@@ -12,35 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import pathlib

import pytest
import sentencepiece

from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
from keras_nlp.tests.test_case import TestCase


class AlbertTokenizerTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
self.init_kwargs = {"proto": bytes_io.getvalue()}
self.init_kwargs = {
"proto": str(
pathlib.Path(__file__).parent.parent.parent
/ "tests"
/ "test_data"
/ "albert_sentencepiece.proto"
)
}
self.input_data = ["the quick brown fox.", "the earth is round."]

def test_tokenizer_basics(self):
@@ -52,17 +41,15 @@ def test_tokenizer_basics(self):
)

def test_errors_missing_special_tokens(self):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(["abc"]),
model_writer=bytes_io,
vocab_size=5,
pad_id=-1,
eos_id=-1,
bos_id=-1,
)
with self.assertRaises(ValueError):
AlbertTokenizer(proto=bytes_io.getvalue())
AlbertTokenizer(
proto=str(
pathlib.Path(__file__).parent.parent.parent
/ "tests"
/ "test_data"
/ "sentencepiece_bad.proto"
Member:
Maybe let's be more specific than "bad" here.

"no_special_token_vocab.spm"

Contributor Author:
Done.
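
Like the main fixture, the special-token-free vocab would need its own generator. A sketch mirroring the removed inline call (five pieces, pad/eos/bos all disabled so the tokenizer's special-token check raises); the output path is an assumption.

    import io
    import pathlib

    import sentencepiece

    bytes_io = io.BytesIO()
    sentencepiece.SentencePieceTrainer.train(
        sentence_iterator=iter(["abc"]),
        model_writer=bytes_io,
        vocab_size=5,
        pad_id=-1,  # -1 disables a special piece entirely
        eos_id=-1,
        bos_id=-1,
    )
    # Filename follows the review suggestion above (assumed path).
    out = pathlib.Path("keras_nlp/tests/test_data/no_special_token_vocab.spm")
    out.write_bytes(bytes_io.getvalue())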

)
)

@pytest.mark.large
def test_smallest_preset(self):
29 changes: 9 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import pathlib

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
from keras_nlp.models.deberta_v3.deberta_v3_classifier import (
@@ -31,25 +30,15 @@
class DebertaV3ClassifierTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
)
self.preprocessor = DebertaV3Preprocessor(
DebertaV3Tokenizer(proto=bytes_io.getvalue()),
DebertaV3Tokenizer(
proto=str(
pathlib.Path(__file__).parent.parent.parent
/ "tests"
/ "test_data"
/ "deberta_v3_sentencepiece.proto"
)
),
sequence_length=5,
)
self.backbone = DebertaV3Backbone(
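The DeBERTa tests load a separate fixture because DeBERTa uses a different special-token layout than ALBERT ([PAD]/[CLS]/[SEP]/[UNK] at ids 0-3, versus <pad>/<unk>/[CLS]/[SEP] at ids 0-3). A sketch of the analogous generator, reconstructed from the removed training call; the script name and output path are assumptions.

    import io
    import pathlib

    import sentencepiece

    vocab_data = ["the quick brown fox", "the earth is round"]
    bytes_io = io.BytesIO()
    sentencepiece.SentencePieceTrainer.train(
        sentence_iterator=iter(vocab_data),
        model_writer=bytes_io,
        vocab_size=12,
        model_type="WORD",
        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3,
        pad_piece="[PAD]",
        bos_piece="[CLS]",
        eos_piece="[SEP]",
        unk_piece="[UNK]",
        user_defined_symbols="[MASK]",
    )
    out = pathlib.Path(
        "keras_nlp/tests/test_data/deberta_v3_test_vocab.spm"  # hypothetical
    )
    out.write_bytes(bytes_io.getvalue())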
keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import pathlib

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_masked_lm_preprocessor import (
DebertaV3MaskedLMPreprocessor,
@@ -26,24 +25,14 @@

class DebertaV3MaskedLMPreprocessorTest(TestCase):
def setUp(self):
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
self.tokenizer = DebertaV3Tokenizer(
proto=str(
pathlib.Path(__file__).parent.parent.parent
/ "tests"
/ "test_data"
/ "deberta_v3_sentencepiece.proto"
)
)
self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue())
self.init_kwargs = {
"tokenizer": self.tokenizer,
# Simplify our testing by masking every available token.
29 changes: 9 additions & 20 deletions keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import pathlib

import pytest
import sentencepiece

from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
from keras_nlp.models.deberta_v3.deberta_v3_masked_lm import DebertaV3MaskedLM
@@ -29,25 +28,15 @@
class DebertaV3MaskedLMTest(TestCase):
def setUp(self):
# Setup model.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=iter(vocab_data),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
bos_id=1,
eos_id=2,
unk_id=3,
pad_piece="[PAD]",
bos_piece="[CLS]",
eos_piece="[SEP]",
unk_piece="[UNK]",
user_defined_symbols="[MASK]",
)
self.preprocessor = DebertaV3MaskedLMPreprocessor(
DebertaV3Tokenizer(proto=bytes_io.getvalue()),
DebertaV3Tokenizer(
proto=str(
pathlib.Path(__file__).parent.parent.parent
/ "tests"
/ "test_data"
/ "deberta_v3_sentencepiece.proto"
)
),
# Simplify our testing by masking every available token.
mask_selection_rate=1.0,
mask_token_rate=1.0,