Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for Camembert Huggingface bert-like models #3799

Merged
merged 7 commits into from
Mar 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions nemo/collections/nlp/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
AlbertEncoder,
BertEncoder,
BertModule,
CamembertEncoder,
DistilBertEncoder,
RobertaEncoder,
SequenceClassifier,
Expand Down
1 change: 1 addition & 0 deletions nemo/collections/nlp/modules/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from nemo.collections.nlp.modules.common.huggingface import (
AlbertEncoder,
BertEncoder,
CamembertEncoder,
DistilBertEncoder,
RobertaEncoder,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from nemo.collections.nlp.modules.common.huggingface.albert import AlbertEncoder
from nemo.collections.nlp.modules.common.huggingface.bert import BertEncoder
from nemo.collections.nlp.modules.common.huggingface.camembert import CamembertEncoder
from nemo.collections.nlp.modules.common.huggingface.distilbert import DistilBertEncoder
from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import (
get_huggingface_lm_model,
Expand Down
33 changes: 33 additions & 0 deletions nemo/collections/nlp/modules/common/huggingface/camembert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2020 The Google AI Language Team Authors and
# The HuggingFace Inc. team.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers import CamembertModel

from nemo.collections.nlp.modules.common.bert_module import BertModule
from nemo.core.classes import typecheck

__all__ = ['CamembertEncoder']


class CamembertEncoder(CamembertModel, BertModule):
    """
    NeMo-facing wrapper around the Huggingface transformers ``CamembertModel`` implementation.
    """

    @typecheck()
    def forward(self, input_ids, token_type_ids, attention_mask):
        """
        Run the parent Huggingface ``forward`` and return the first element of its output.

        Args:
            input_ids: passed to the parent ``forward`` as ``input_ids``.
            token_type_ids: accepted as part of the signature but not passed to the parent ``forward``.
            attention_mask: passed to the parent ``forward`` as ``attention_mask``.

        Returns:
            Element ``0`` of the parent model's output
            (presumably the last hidden states - TODO confirm against the transformers docs).
        """
        # Only input_ids and attention_mask are forwarded; token_type_ids is unused here.
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask)
        return outputs[0]
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,22 @@
ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
ALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
AlbertConfig,
AutoModel,
BertConfig,
CamembertConfig,
DistilBertConfig,
GPT2Config,
RobertaConfig,
)

from nemo.collections.nlp.modules.common.huggingface.albert import AlbertEncoder
from nemo.collections.nlp.modules.common.huggingface.bert import BertEncoder
from nemo.collections.nlp.modules.common.huggingface.camembert import CamembertEncoder
from nemo.collections.nlp.modules.common.huggingface.distilbert import DistilBertEncoder
from nemo.collections.nlp.modules.common.huggingface.gpt2 import GPT2Encoder
from nemo.collections.nlp.modules.common.huggingface.roberta import RobertaEncoder
Expand All @@ -53,6 +56,12 @@
"config": DistilBertConfig,
"pretrained_model_list": DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
},
"CamembertModel": {
"default": "camembert-base-uncased",
"class": CamembertEncoder,
"config": CamembertConfig,
"pretrained_model_list": CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
},
"RobertaModel": {
"default": "roberta-base",
"class": RobertaEncoder,
Expand All @@ -78,6 +87,7 @@
'RobertaTokenizer': "vocab.json",
'BertTokenizer': "vocab.txt",
'DistilBertTokenizer': "vocab.txt",
'CamembertTokenizer': "sentencepiece.bpe.model",
ekmb marked this conversation as resolved.
Show resolved Hide resolved
"GPT2Tokenizer": "vocab.json",
}

Expand Down Expand Up @@ -127,10 +137,10 @@ def get_huggingface_lm_model(
def get_huggingface_pretrained_lm_models_list(include_external: bool = False,) -> List[str]:
"""
Returns the list of pretrained HuggingFace language models

Args:
include_external: if true, includes all HuggingFace model names, not only the language models supported in NeMo.

Returns the list of HuggingFace models
"""

Expand Down