diff --git a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py b/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py
index 66f09cbe202b..cf0081f7bd83 100644
--- a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py
+++ b/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Dict, Optional
+from typing import Dict, Optional
 
 import numpy as np
 
@@ -59,6 +59,11 @@ def get_features(
         for j, word in enumerate(words):
             word_tokens = tokenizer.text_to_tokens(word)
+
+            # to handle emojis that could be neglected during tokenization
+            if len(word.strip()) > 0 and len(word_tokens) == 0:
+                word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)]
+
             subtokens.extend(word_tokens)
             loss_mask.append(1)
diff --git a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py b/nemo/collections/nlp/data/token_classification/token_classification_dataset.py
index d102ff78e0c4..aa1ad0b1107c 100644
--- a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py
+++ b/nemo/collections/nlp/data/token_classification/token_classification_dataset.py
@@ -87,6 +87,11 @@ def get_features(
         for j, word in enumerate(words):
             word_tokens = tokenizer.text_to_tokens(word)
+
+            # to handle emojis that could be neglected during tokenization
+            if len(word.strip()) > 0 and len(word_tokens) == 0:
+                word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)]
+
             subtokens.extend(word_tokens)
             loss_mask.append(1)
diff --git a/nemo/collections/nlp/models/nlp_model.py b/nemo/collections/nlp/models/nlp_model.py
index 1cea91f2ba53..a2b2081e5dee 100644
--- a/nemo/collections/nlp/models/nlp_model.py
+++ b/nemo/collections/nlp/models/nlp_model.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import glob
 import hashlib
 import json
 import os
@@ -40,7 +39,6 @@
 from nemo.core.classes.exportable import Exportable
 from nemo.core.connectors.save_restore_connector import SaveRestoreConnector
 from nemo.utils import AppState, logging
-from nemo.utils.exp_manager import configure_checkpointing
 from nemo.utils.get_rank import is_global_rank_zero
 
 __all__ = ['NLPModel']
diff --git a/nemo/collections/nlp/models/token_classification/token_classification_model.py b/nemo/collections/nlp/models/token_classification/token_classification_model.py
index 5781fbf4189e..ed5d287a50da 100644
--- a/nemo/collections/nlp/models/token_classification/token_classification_model.py
+++ b/nemo/collections/nlp/models/token_classification/token_classification_model.py
@@ -15,7 +15,6 @@
 import os
 from typing import Dict, List, Optional, Union
 
-import onnx
 import torch
 from omegaconf import DictConfig, OmegaConf
 from pytorch_lightning import Trainer
diff --git a/nemo/collections/nlp/modules/common/lm_utils.py b/nemo/collections/nlp/modules/common/lm_utils.py
index c0db52b99674..4feb5698e8b1 100644
--- a/nemo/collections/nlp/modules/common/lm_utils.py
+++ b/nemo/collections/nlp/modules/common/lm_utils.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from dataclasses import dataclass
 from typing import List, Optional, Union
 
 from attr import asdict
@@ -36,7 +35,7 @@
     get_megatron_transformer,
     get_nemo_transformer,
 )
-from nemo.utils import logging
+from nemo.utils import AppState, logging
 
 __all__ = ['get_pretrained_lm_models_list', 'get_lm_model']
@@ -102,7 +101,10 @@ def get_lm_model(
         config_dict=config_dict, config_file=config_file, pretrained_model_name=pretrained_model_name,
     )
 
-    if checkpoint_file and os.path.exists(checkpoint_file):
+    if checkpoint_file:
+        app_state = AppState()
+        if not app_state.is_model_being_restored and not os.path.exists(checkpoint_file):
+            raise ValueError(f'{checkpoint_file} not found')
         model.restore_weights(restore_path=checkpoint_file)
 
     return model
diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/tokenize_and_classify.py
index fc974b837042..cd98f7846425 100644
--- a/nemo_text_processing/inverse_text_normalization/ru/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/tokenize_and_classify.py
@@ -114,7 +114,6 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
         graph = delete_space + graph + delete_space
         self.fst = graph.optimize()
 
-        generator_main(far_file, {"tokenize_and_classify": self.fst})
         if far_file:
             generator_main(far_file, {"tokenize_and_classify": self.fst})
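The emoji fix applied identically in both dataset files can be exercised in isolation. Below is a minimal sketch: ToyTokenizer is a hypothetical stand-in (not a NeMo class) that drops out-of-vocabulary words the way some real tokenizers drop emojis; only the guard inside the loop is taken verbatim from the patch.

class ToyTokenizer:
    """Hypothetical stand-in for a tokenizer with a limited vocabulary."""

    unk_id = 0
    _id_to_token = {0: "[UNK]", 1: "hello", 2: "world"}
    _token_to_id = {t: i for i, t in _id_to_token.items()}

    def text_to_tokens(self, text):
        # Out-of-vocabulary input (e.g. an emoji) yields no tokens at all,
        # which is the failure mode the patch guards against.
        return [text] if text in self._token_to_id else []

    def ids_to_tokens(self, ids):
        # Mirrors the call shape in the patch, which passes a single id.
        return self._id_to_token[ids]


tokenizer = ToyTokenizer()
for word in ["hello", "🙂", "world"]:
    word_tokens = tokenizer.text_to_tokens(word)
    # to handle emojis that could be neglected during tokenization
    if len(word.strip()) > 0 and len(word_tokens) == 0:
        word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)]
    print(word, "->", word_tokens)
# hello -> ['hello']
# 🙂 -> ['[UNK]']
# world -> ['world']

Without the guard, a word made only of emojis would contribute zero subtokens while still occupying a slot in loss_mask and subtokens_mask, breaking the word-to-subtoken alignment.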
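The lm_utils.py hunk also changes behavior, not just imports: before, a checkpoint_file that did not exist on disk was silently skipped; now it raises, unless a model restore is in flight (AppState().is_model_being_restored), in which case the path may only become valid later. A minimal sketch of that control flow, with AppState and DummyModel as hypothetical stand-ins for the NeMo classes:

import os


class AppState:
    """Hypothetical stand-in for nemo.utils.AppState."""

    is_model_being_restored = False


class DummyModel:
    """Hypothetical stand-in; only restore_weights matters here."""

    def restore_weights(self, restore_path):
        print(f"restoring from {restore_path}")


def maybe_restore(model, checkpoint_file):
    # Same control flow as the patched get_lm_model(): a missing checkpoint
    # fails fast instead of being silently ignored, except while a model
    # restore is in progress and the path may be resolved later.
    if checkpoint_file:
        app_state = AppState()
        if not app_state.is_model_being_restored and not os.path.exists(checkpoint_file):
            raise ValueError(f'{checkpoint_file} not found')
        model.restore_weights(restore_path=checkpoint_file)
    return model


maybe_restore(DummyModel(), None)  # no checkpoint requested: nothing happens
try:
    maybe_restore(DummyModel(), "/tmp/does_not_exist.ckpt")
except ValueError as e:
    print(e)  # /tmp/does_not_exist.ckpt not found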