Skip to content

Commit

Permalink
fix for emojis (#2675)
Browse the repository at this point in the history
* fix for emojis

Signed-off-by: ekmb <[email protected]>

* remove redundant line

Signed-off-by: ekmb <[email protected]>

* raise error

Signed-off-by: ekmb <[email protected]>

* use app_state

Signed-off-by: ekmb <[email protected]>

Co-authored-by: Eric Harper <[email protected]>
Branch information:
ekmb and ericharper authored Aug 19, 2021
1 parent 595dc4d commit 1f0bf96
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict, Optional
from typing import Dict, Optional

import numpy as np

Expand Down Expand Up @@ -59,6 +59,11 @@ def get_features(

for j, word in enumerate(words):
word_tokens = tokenizer.text_to_tokens(word)

# to handle emojis that could be neglected during tokenization
if len(word.strip()) > 0 and len(word_tokens) == 0:
word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)]

subtokens.extend(word_tokens)

loss_mask.append(1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,11 @@ def get_features(

for j, word in enumerate(words):
word_tokens = tokenizer.text_to_tokens(word)

# to handle emojis that could be neglected during tokenization
if len(word.strip()) > 0 and len(word_tokens) == 0:
word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)]

subtokens.extend(word_tokens)

loss_mask.append(1)
Expand Down
2 changes: 0 additions & 2 deletions nemo/collections/nlp/models/nlp_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import hashlib
import json
import os
Expand Down Expand Up @@ -40,7 +39,6 @@
from nemo.core.classes.exportable import Exportable
from nemo.core.connectors.save_restore_connector import SaveRestoreConnector
from nemo.utils import AppState, logging
from nemo.utils.exp_manager import configure_checkpointing
from nemo.utils.get_rank import is_global_rank_zero

__all__ = ['NLPModel']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import os
from typing import Dict, List, Optional, Union

import onnx
import torch
from omegaconf import DictConfig, OmegaConf
from pytorch_lightning import Trainer
Expand Down
8 changes: 5 additions & 3 deletions nemo/collections/nlp/modules/common/lm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass
from typing import List, Optional, Union

from attr import asdict
Expand All @@ -36,7 +35,7 @@
get_megatron_transformer,
get_nemo_transformer,
)
from nemo.utils import logging
from nemo.utils import AppState, logging

__all__ = ['get_pretrained_lm_models_list', 'get_lm_model']

Expand Down Expand Up @@ -102,7 +101,10 @@ def get_lm_model(
config_dict=config_dict, config_file=config_file, pretrained_model_name=pretrained_model_name,
)

if checkpoint_file and os.path.exists(checkpoint_file):
if checkpoint_file:
app_state = AppState()
if not app_state.is_model_being_restored and not os.path.exists(checkpoint_file):
raise ValueError(f'{checkpoint_file} not found')
model.restore_weights(restore_path=checkpoint_file)

return model
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):

graph = delete_space + graph + delete_space
self.fst = graph.optimize()
generator_main(far_file, {"tokenize_and_classify": self.fst})

if far_file:
generator_main(far_file, {"tokenize_and_classify": self.fst})
Expand Down

0 comments on commit 1f0bf96

Please sign in to comment.