
Commit 879020a

Merge branch 'keras-team:master' into electra

2 parents: f812c39 + bd77450

48 files changed: +1152 −481 lines (large commit; only a subset of the changed files is shown below).

keras_nlp/backend/config.py

Lines changed: 1 addition & 1 deletion

@@ -66,7 +66,7 @@
     _MULTI_BACKEND = True

 # If keras is version 3, use multi-backend keras (our only option).
-_IS_KERAS_3 = version.parse(keras.__version__) >= version.parse("3.0.0")
+_IS_KERAS_3 = version.parse(keras.__version__) >= version.parse("3.0.0.dev0")
 if _IS_KERAS_3:
     _MULTI_BACKEND = True

keras_nlp/models/albert/albert_classifier_test.py

Lines changed: 8 additions & 21 deletions

@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_backbone import AlbertBackbone
 from keras_nlp.models.albert.albert_classifier import AlbertClassifier
@@ -27,26 +26,14 @@
 class AlbertClassifierTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = AlbertPreprocessor(
-            AlbertTokenizer(proto=bytes_io.getvalue()),
-            sequence_length=5,
+            AlbertTokenizer(
+                # Generated using create_albert_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "albert_test_vocab.spm"
+                ),
+                sequence_length=5,
+            )
         )
         self.backbone = AlbertBackbone(
             vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
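
The comments above point at create_albert_test_proto.py, which is not part of this diff. A plausible sketch of that generator, assuming it reuses the exact sentencepiece arguments deleted from setUp and writes into the tests' data directory (the output path is an assumption):

import io
import os

import sentencepiece

# Same training call that was previously inlined in each test's setUp.
vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=iter(vocab_data),
    model_writer=bytes_io,
    vocab_size=12,
    model_type="WORD",
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    pad_piece="<pad>",
    unk_piece="<unk>",
    bos_piece="[CLS]",
    eos_piece="[SEP]",
    user_defined_symbols="[MASK]",
)
# Assumed output location; the tests resolve it via self.get_test_data_dir().
path = os.path.join("keras_nlp", "tests", "test_data", "albert_test_vocab.spm")
with open(path, "wb") as f:
    f.write(bytes_io.getvalue())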

keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py

Lines changed: 6 additions & 19 deletions

@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_masked_lm_preprocessor import (
     AlbertMaskedLMPreprocessor,
@@ -26,24 +25,12 @@

 class AlbertMaskedLMPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = AlbertTokenizer(
+            # Generated using create_albert_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
         )
-        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             # Simplify our testing by masking every available token.

keras_nlp/models/albert/albert_masked_lm_test.py

Lines changed: 8 additions & 20 deletions

@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_backbone import AlbertBackbone
 from keras_nlp.models.albert.albert_masked_lm import AlbertMaskedLM
@@ -29,25 +28,14 @@
 class AlbertMaskedLMTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = AlbertMaskedLMPreprocessor(
-            AlbertTokenizer(proto=bytes_io.getvalue()),
+            AlbertTokenizer(
+                # Generated using create_albert_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "albert_test_vocab.spm"
+                ),
+                sequence_length=5,
+            ),
             # Simplify our testing by masking every available token.
             mask_selection_rate=1.0,
             mask_token_rate=1.0,

keras_nlp/models/albert/albert_preprocessor_test.py

Lines changed: 6 additions & 19 deletions

@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor
 from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
@@ -24,24 +23,12 @@

 class AlbertPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = AlbertTokenizer(
+            # Generated using create_albert_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
         )
-        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             "sequence_length": 8,

keras_nlp/models/albert/albert_tokenizer_test.py

Lines changed: 13 additions & 30 deletions

@@ -12,35 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
 from keras_nlp.tests.test_case import TestCase


 class AlbertTokenizerTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
-        self.init_kwargs = {"proto": bytes_io.getvalue()}
+        self.init_kwargs = {
+            # Generated using create_albert_test_proto.py
+            "proto": os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
+        }
         self.input_data = ["the quick brown fox.", "the earth is round."]

     def test_tokenizer_basics(self):
@@ -52,17 +39,13 @@ def test_tokenizer_basics(self):
         )

     def test_errors_missing_special_tokens(self):
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(["abc"]),
-            model_writer=bytes_io,
-            vocab_size=5,
-            pad_id=-1,
-            eos_id=-1,
-            bos_id=-1,
-        )
         with self.assertRaises(ValueError):
-            AlbertTokenizer(proto=bytes_io.getvalue())
+            AlbertTokenizer(
+                # Generated using create_no_special_token_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "no_special_token_vocab.spm"
+                )
+            )

     @pytest.mark.large
     def test_smallest_preset(self):
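
The negative test now loads a second fixture, no_special_token_vocab.spm, attributed to create_no_special_token_proto.py (also absent from this diff). A sketch along the same lines, reusing the training call deleted from test_errors_missing_special_tokens; the output path is again an assumption:

import io
import os

import sentencepiece

bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=iter(["abc"]),
    model_writer=bytes_io,
    vocab_size=5,
    pad_id=-1,  # -1 disables the piece, so the proto has no special tokens
    eos_id=-1,
    bos_id=-1,
)
# Assumed output location, mirroring the other test fixtures.
path = os.path.join(
    "keras_nlp", "tests", "test_data", "no_special_token_vocab.spm"
)
with open(path, "wb") as f:
    f.write(bytes_io.getvalue())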

keras_nlp/models/backbone.py

Lines changed: 3 additions & 1 deletion

@@ -112,12 +112,14 @@ def from_preset(
         if not load_weights:
             return model

+        filename = os.path.basename(metadata["weights_url"])
         weights = keras.utils.get_file(
-            "model.h5",
+            filename,
            metadata["weights_url"],
            cache_subdir=os.path.join("models", preset),
            file_hash=metadata["weights_hash"],
        )
+
        model.load_weights(weights)
        return model
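
Deriving the cache filename from the URL instead of hardcoding "model.h5" lets presets ship weights under any name (for example a Keras 3 .weights.h5 file) without the download landing under the wrong name in the cache. A minimal illustration, with a hypothetical weights_url:

import os

# Hypothetical preset metadata entry; only the basename becomes the filename.
weights_url = "https://example.com/models/albert_base_en/model.weights.h5"

filename = os.path.basename(weights_url)
assert filename == "model.weights.h5"  # previously always "model.h5"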

keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py

Lines changed: 7 additions & 20 deletions

@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
 from keras_nlp.models.deberta_v3.deberta_v3_classifier import (
@@ -31,25 +30,13 @@
 class DebertaV3ClassifierTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = DebertaV3Preprocessor(
-            DebertaV3Tokenizer(proto=bytes_io.getvalue()),
+            DebertaV3Tokenizer(
+                # Generated using create_deberta_v3_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+                )
+            ),
             sequence_length=5,
         )
         self.backbone = DebertaV3Backbone(
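
The DeBERTa fixtures come from a separate generator, create_deberta_v3_test_proto.py, because the special-token layout differs from ALBERT's ([PAD]=0, [CLS]=1, [SEP]=2, [UNK]=3 rather than <pad>=0, <unk>=1). A plausible sketch, again reusing the deleted training arguments and assuming the output path:

import io
import os

import sentencepiece

vocab_data = ["the quick brown fox", "the earth is round"]
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=iter(vocab_data),
    model_writer=bytes_io,
    vocab_size=12,
    model_type="WORD",
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3,
    pad_piece="[PAD]",
    bos_piece="[CLS]",
    eos_piece="[SEP]",
    unk_piece="[UNK]",
    user_defined_symbols="[MASK]",
)
# Assumed output location, mirroring the other test fixtures.
path = os.path.join(
    "keras_nlp", "tests", "test_data", "deberta_v3_test_vocab.spm"
)
with open(path, "wb") as f:
    f.write(bytes_io.getvalue())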

keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py

Lines changed: 6 additions & 19 deletions

@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.deberta_v3.deberta_v3_masked_lm_preprocessor import (
     DebertaV3MaskedLMPreprocessor,
@@ -26,24 +25,12 @@

 class DebertaV3MaskedLMPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = DebertaV3Tokenizer(
+            # Generated using create_deberta_v3_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+            )
         )
-        self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             # Simplify our testing by masking every available token.

keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py

Lines changed: 7 additions & 20 deletions

@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import io
+import os

 import pytest
-import sentencepiece

 from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone
 from keras_nlp.models.deberta_v3.deberta_v3_masked_lm import DebertaV3MaskedLM
@@ -29,25 +28,13 @@
 class DebertaV3MaskedLMTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            bos_id=1,
-            eos_id=2,
-            unk_id=3,
-            pad_piece="[PAD]",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            unk_piece="[UNK]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = DebertaV3MaskedLMPreprocessor(
-            DebertaV3Tokenizer(proto=bytes_io.getvalue()),
+            DebertaV3Tokenizer(
+                # Generated using create_deberta_v3_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "deberta_v3_test_vocab.spm"
+                )
+            ),
             # Simplify our testing by masking every available token.
             mask_selection_rate=1.0,
             mask_token_rate=1.0,
